def setup_error_logging(logfile, level_console="DEBUG", level_logfile="DEBUG"):
    """ Sets up error logging, and logs a number of system settings.

    Parameters
    ----------
    logfile : str
        Path to output logfile. If size exceeds limit set below in JSON settings, path.1, path.2 etc will be created.
    level_console : str
        Logging level for printing to console. DEBUG, WARNING or CRITICAL.
    level_logfile : str
        Logging level for printing to logfile. DEBUG, WARNING or CRITICAL.

    Returns
    -------
    logging : logging.Logger
        Logging object, for printing to console and logfile.
    """
    # load the log settings in json format
    logsettings = json.dumps({
        "handlers": {
            "console": {
                "formatter": "brief",
                "class": "logging.StreamHandler",
                "stream": "ext://sys.stdout",
                "level": "DEBUG"
            },
            "file": {
                "maxBytes": 10000000,
                "formatter": "precise",
                "backupCount": 3,
                "class": "logging.handlers.RotatingFileHandler",
                "level": "DEBUG",
                "filename": "logfile.txt"
            }
        },
        "version": 1,
        "root": {
            "handlers": ["console", "file"],
            "propagate": "no",
            "level": "DEBUG"
        },
        "formatters": {
            "simple": {
                "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            },
            "precise": {
                "format": "%(asctime)s %(name)-15s %(levelname)-8s %(message)s"
            },
            "brief": {
                "format": "%(message)s"
            }
        }
    }, skipkeys=True, sort_keys=True, indent=4, separators=(',', ': '))

    config = json.loads(logsettings)
    # add user parameters to the logging settings (logfile, and logging levels)
    config['handlers']['file']['filename'] = logfile
    config['handlers']['console']['level'] = level_console
    config['handlers']['file']['level'] = level_logfile

    # create folder if necessary
    utils.make_sure_path_exists(logfile, isfile=True)
    # create a blank logging file
    with open(logfile, 'w') as f:
        pass

    # clear any previous logging handlers that might have been previously run in the console
    logging.getLogger('').handlers = []
    # load the logging settings from the modified json string
    logging.config.dictConfig(config)

    # collect a number of system settings that could be useful for troubleshooting
    system_settings_dict = {}
    system_settings_dict["system description"] = platform.uname()
    system_settings_dict["system"] = platform.system()
    system_settings_dict["architecture"] = platform.architecture()
    system_settings_dict["network_name"] = platform.node()
    system_settings_dict["release"] = platform.release()
    system_settings_dict["version"] = platform.version()
    system_settings_dict["machine"] = platform.machine()
    system_settings_dict["processor"] = platform.processor()
    system_settings_dict["python_version"] = platform.python_version()
    system_settings_dict["python_build"] = platform.python_build()
    system_settings_dict["python_compiler"] = platform.python_compiler()
    system_settings_dict["argv"] = sys.argv
    system_settings_dict["dirname(argv[0])"] = os.path.abspath(os.path.expanduser(os.path.dirname(sys.argv[0])))
    system_settings_dict["pwd"] = os.path.abspath(os.path.expanduser(os.path.curdir))
    system_settings_dict["total_ram"] = "{:0.2f} GB".format(psutil.virtual_memory()[0] / 1000000000)
    system_settings_dict["available_ram"] = "{:0.2f} GB ({}% used)".format(psutil.virtual_memory()[1] / 1000000000,
                                                                           psutil.virtual_memory()[2])
    # log the system settings
    logging.warning("system description : {}".format(system_settings_dict))

    # test error message reporting
    # logging.warning('LOGGING TEST:')
    # try:
    #     open('/path/to/does/not/exist', 'rb')
    # except (SystemExit, KeyboardInterrupt):
    #     raise
    # except Exception:
    #     logging.error('Failed to open file', exc_info=True)

    logging.warning('logging setup is successful (logging levels: console={}, logfile={}). \n'.format(level_console, level_logfile))

    return logging
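
# Minimal usage sketch for setup_error_logging (not part of the original pipeline).
# The logfile path below is hypothetical; the RotatingFileHandler configured above keeps
# up to 3 backups of ~10 MB each once the logfile exceeds the size limit.
def _example_setup_error_logging():
    """Illustrates calling setup_error_logging with a quiet console and a verbose logfile."""
    import os
    import tempfile
    logfile = os.path.join(tempfile.gettempdir(), "korbinian_example_logfile.txt")
    # console only shows WARNING and above, logfile records everything
    logger = setup_error_logging(logfile, level_console="WARNING", level_logfile="DEBUG")
    logger.info("this message goes to the logfile only")
    logger.warning("this message goes to both console and logfile")
    return logfile
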
def slice_TMD_1_prot_from_homol(p):
    """ Slices TMDs from homologues, for a single protein in the list.

    - checks that the homol_df_orig_zip file exists with the full homologue sequences
    - if slice_juxtamembrane_regions is chosen, conducts JM slicing (currently not stable)
    - removes any old, existing fa_cr_sliced_TMDs_zip files
    - creates df_nonTMD_sliced Dataframe to hold the sliced nonTMD regions, based on the indices from all the regex searches
    - for each TMD:
        - identifies and slices out the TMD region from the query, markup and match from each SW alignment
        - the dataframe with sliced sequences for each TMD is added to the fa_cr_sliced_TMDs_zip as
          PROTEIN_NAME_TM01_sliced_df.pickle, PROTEIN_NAME_TM02_sliced_df.pickle, etc
        - adds the indices for each TMD to the df_nonTMD_sliced Dataframe
    - when each TMD is finished:
        df_nonTMD_sliced uses the indices for each TMD to create the indices for the full nonTMD region
        (korbinian.cons_ratio.slice.slice_nonTMD_seqs)
        df_nonTMD_sliced contains the nonTMD region, as one large slice of query, markup or match from the alignment,
        pieces joined end-to-end
        df_nonTMD_sliced is saved in fa_cr_sliced_TMDs_zip as PROTEIN_NAME_nonTMD_sliced_df.pickle

    Parameters
    ----------
    p : dict
        Protein Dictionary. Contains all input settings, sequences and filepaths related to a single protein.
        Protein-specific data is extracted from one row of the list summary, e.g. List05_summary.csv, which is read as df.
        p also contains the GENERAL korbinian settings and filepaths for that list (pathdict, s, logging)

        Components
        ----------
        pathdict : dict
            Dictionary of the key paths and files associated with that List number.
        s : dict
            Settings dictionary extracted from excel settings file.
        logging : logging.Logger
            Logger for printing to console and/or logfile.
            If multiprocessing == True, logging.info etc will only print to console.
        p : protein-specific dictionary components
            acc, list_of_TMDs, description, TM01_seq, etc

    Dataframes
    ----------
    dfs
        Dataframe for Sequences
        index = hit_num
        columns = md5, FASTA_gapped_identity, hit_contains_SW_node, organism, X_in_match_seq, disallowed_words_not_in_descr, etc
    df_TMD
        Dataframe for 1 TMD, from 1 protein
        index = hit_num
        columns = 'organism', 'description', 'TM01_in_SW_alignment', 'TM01_start_in_SW_alignment', 'TM01_end_in_SW_alignment',
                  'TM01_SW_query_seq', 'TM01_SW_markup_seq', 'TM01_SW_match_seq', etc
    df_nonTMD_sliced
        Dataframe for all nonTMD region, from 1 protein
        index = hit_num
        columns = 'nested_tuple_indices_all_nonTMD_regions', 'nonTMD_markup', 'nonTMD_seq_match', 'len_query_align_seq',
                  'TM01_in_SW_alignment', 'TM01_start_in_SW_alignment', 'TM01_end_in_SW_alignment', 'TM02_in_SW_alignment', etc

    Saved Files and Figures
    -----------------------
    fa_cr_sliced_TMDs_zip
        df_nonTMD_temp_pickle, e.g. A4ARX1_nonTMD_sliced_df.pickle
        TM_temp_pickle, e.g. A4ARX1_TM01_sliced_df.pickle

    Returns
    -------
    In all cases, a tuple (str, bool, str) is returned.
    if successful:
        return acc, True, "0"
    if not successful:
        return acc, False, "specific warning or reason why protein failed"
    """
    pathdict, s, logging = p["pathdict"], p["s"], p["logging"]
    acc = p["acc"]
    sys.stdout.write("{} ".format(acc))
    sys.stdout.flush()
    protein_name = p['protein_name']

    if not os.path.exists(p['homol_df_orig_zip']):
        warning = "{} Protein skipped. File does not exist".format(p['homol_df_orig_zip'])
        logging.info(warning)
        return acc, False, warning

    if utils.file_is_old(p['homol_df_orig_zip'], s["oldest_acceptable_file_date"]):
        os.remove(p['homol_df_orig_zip'])
        message = "{} skipped, file is old and has been deleted".format(acc)
        logging.info(message)
        return acc, False, message

    dfs = utils.open_df_from_pickle_zip(p['homol_df_orig_zip'], delete_corrupt=True)
    if dfs.empty:
        warning = "{} Protein skipped, file deleted as it is possibly corrupt.".format(p['homol_df_orig_zip'])
        logging.info(warning)
        return acc, False, warning

    list_of_TMDs = ast.literal_eval(p['list_of_TMDs'])

    # create a boolean "p_is_multipass" to show whether protein is multipass (>1 TMD) or singlepass (1 TMD)
    if "TM02" in list_of_TMDs:
        p_is_multipass = True
    else:
        p_is_multipass = False

    # create counter for number of TMDs with some homologue data
    n_TMDs_w_homol = 0

    fa_cr_sliced_TMDs_zip = p['fa_cr_sliced_TMDs_zip']
    if os.path.isfile(fa_cr_sliced_TMDs_zip):
        if s["overwrite_sliced_homologues"] == True:
            # delete any existing sliced zipfile
            os.remove(fa_cr_sliced_TMDs_zip)
        else:
            warning = "{} skipped, output from slice_TMD_1_prot_from_homol already exists".format(acc)
            logging.info(warning)
            # skip this protein
            return acc, False, warning

    utils.make_sure_path_exists(fa_cr_sliced_TMDs_zip, isfile=True)

    # open new zipfile (NOTE, it must be closed later!!)
    with zipfile.ZipFile(fa_cr_sliced_TMDs_zip, mode="a", compression=zipfile.ZIP_DEFLATED) as homol_sliced_zip:
        # get directory for zip (and other temp files to be transferred)
        homol_dir = os.path.dirname(fa_cr_sliced_TMDs_zip)

        # create a specific dataframe to hold the nonTMD region, including indices (True, start, end) of all the TMD segments
        if "len_query_align_seq" not in dfs.columns:
            warning = "{} len_query_align_seq not in columns, protein skipped for slice_TMD_1_prot_from_homol".format(acc)
            logging.warning(warning)
            # skip protein
            return acc, False, warning

        # add the FASTA_gapped_identity and length of the alignment sequence from dfs, to act as the "end" of all the nonTMD regions
        df_nonTMD_sliced = dfs[['len_query_align_seq', 'SW_query_coverage']].copy()

        # start with an empty dataframe, that will be replaced if there is any data to analyse
        df_TMD = pd.DataFrame()

        for TMD in list_of_TMDs:
            query_TMD_sequence = p['%s_seq' % TMD]
            if type(query_TMD_sequence) == float:
                warning = "{} {} query_TMD_sequence is np.nan! skipping protein.".format(acc, TMD)
                logging.warning(warning)
                # skip protein
                return acc, False, warning
            ## SHOULD NOT BE NECESSARY. OMPdb DATABASE NOW FIXED TO AVOID NAN VALUES IN TM_SEQ
            # if isinstance(query_TMD_sequence, float):
            #     warning = "{} {} query_TMD_sequence is a float ({}), probably np.nan.".format(acc, TMD, query_TMD_sequence)
            #     logging.warning(warning)
            #     return acc, False, warning
            df_TMD = korbinian.cons_ratio.slice.slice_1_TMD_from_homol(acc, TMD, query_TMD_sequence, dfs, s, logging)
            if df_TMD.empty:
                warning = "{} {} df_TMD.empty, probably number_of_rows_containing_data == 0".format(acc, TMD)
                logging.warning(warning)
                # skip TMD, as number_of_rows_containing_data == 0
                # here I really should skip the protein too. It's tempting to use goto :). "from goto import goto" (http://entrian.com/goto/)
                return acc, False, warning
            n_TMDs_w_homol += 1

            # transfer the columns with indices across to the df_nonTMD_sliced
            cols = ['%s_in_SW_alignment' % TMD, '%s_start_in_SW_alignment' % TMD, '%s_end_in_SW_alignment' % TMD]
            for col in cols:
                df_nonTMD_sliced[col] = df_TMD[col]

            TM_temp_pickle = os.path.join(homol_dir, "{}_{}_sliced_df.pickle".format(protein_name, TMD))
            with open(TM_temp_pickle, "wb") as f:
                pickle.dump(df_TMD, f, protocol=pickle.HIGHEST_PROTOCOL)
            homol_sliced_zip.write(TM_temp_pickle, arcname=os.path.basename(TM_temp_pickle))
            os.remove(TM_temp_pickle)
            sys.stdout.write(".")
            sys.stdout.flush()

        if df_TMD.empty:
            # skip protein, as number_of_rows_containing_data == 0 for at least one TMD (or at least the last TMD)
            warning = "{} skipped, number_of_rows_containing_data == 0 for at least one TMD".format(acc)
            logging.info(warning)
            return acc, False, warning

        df_nonTMD_sliced = korbinian.cons_ratio.slice.slice_nonTMD_seqs(dfs, df_nonTMD_sliced, list_of_TMDs)
        if df_nonTMD_sliced.empty:
            warning = "{} df_nonTMD_sliced is empty, probably this means no homologues contain all TMDs".format(acc)
            logging.warning(warning)
            # skip protein
            return acc, False, warning

        if s["slice_juxtamembrane_regions"] == True:
            for TMD in list_of_TMDs:
                ########################################################################################
                #                                                                                      #
                #            Define juxtamembrane regions associated with each TMD [AAIMON]            #
                #                                                                                      #
                ########################################################################################
                # convert the tuple of (True, 32, 53) into separate dataframes.
                # http://stackoverflow.com/questions/29550414/how-to-split-column-of-tuples-in-pandas-dataframe
                if p_is_multipass:
                    next_TMD = "TM{:02d}".format(int(TMD[2:]) + 1)
                    prev_TMD = "TM{:02d}".format(int(TMD[2:]) - 1)
                    # df_next_TMD = df_TMD = korbinian.cons_ratio.slice.slice_1_TMD_from_homol(acc, next_TMD, query_TMD_sequence, dfs, s, logging)
                    # if TMD != "TM01":
                    #     df_prev_TMD = df_TMD = korbinian.cons_ratio.slice.slice_1_TMD_from_homol(acc, prev_TMD, query_TMD_sequence, dfs, s, logging)

                last_TMD_of_acc = list_of_TMDs[-1]

                if TMD == "TM01":
                    # np.where syntax: np.where(boolean_query, value_if_query_true, value_if_query_false)
                    # @RJ, If TM01_start_in_SW_alignment is not an integer above 0, replaces with np.nan?
                    df_nonTMD_sliced['start_juxta_before_TM01'] = np.where(df_nonTMD_sliced['TM01_start_in_SW_alignment'] > 0, 0, np.nan)
                    # if the TM01_start_in_SW_alignment is 0, there is no JM region N-terminal to the TMD, therefore replace end_juxta_before_TM01 with np.nan, otherwise use TM01_start_in_SW_alignment
                    df_nonTMD_sliced['end_juxta_before_TM01'] = np.where(df_nonTMD_sliced['TM01_start_in_SW_alignment'] == 0, np.nan, df_nonTMD_sliced['TM01_start_in_SW_alignment'])
                    # set the start of the juxta as the end of the TMD
                    df_nonTMD_sliced['start_juxta_after_TM01'] = df_nonTMD_sliced['TM01_end_in_SW_alignment']
                    # if there is only one TMD (search for TM02 rather than measuring length of list, in case of signal peptides)
                    if p_is_multipass:
                        # open up the dataframes of the next and previous TMD
                        # define the end_juxta_after_TM01 as the TM01 end + half of the TM01_to_TM02 JM region
                        # NOTE, due to np.nan this is a float. will be converted to integers later
                        df_nonTMD_sliced['end_juxta_after_TM01'] = df_nonTMD_sliced["TM01_end_in_SW_alignment"] + ((df_nonTMD_sliced["TM02_start_in_SW_alignment"] - df_nonTMD_sliced["TM01_end_in_SW_alignment"]) / 2)
                        # RJ original
                        ## problem('df_nonTMD_sliced["TM02_start_in_SW_alignment"] cannot exist yet, because the script iterates through the TMDs one at a time')
                        # df_nonTMD_sliced['end_juxta_after_TM01'] = df_nonTMD_sliced["TM01_end_in_SW_alignment"] + ((df_nonTMD_sliced["TM02_start_in_SW_alignment"] - df_nonTMD_sliced["TM01_end_in_SW_alignment"]) / 2).apply(lambda x: int(x) if not np.isnan(x) else np.nan)
                        # RJ commented out
                        # df_nonTMD_sliced['seq_juxta_after_TM01_in_query'] = df_nonTMD_sliced[df_nonTMD_sliced['start_juxta_after_TM01'].notnull()].apply(utils.slice_juxta_after_TMD_in_query, args = (TMD,), axis=1)
                        # df_nonTMD_sliced['seq_juxta_after_TM01_in_match'] = df_nonTMD_sliced[df_nonTMD_sliced['end_juxta_after_TM01'].notnull()].apply(utils.slice_juxta_after_TMD_in_match, args = (TMD,), axis=1)
                    else:
                        # if there is only one TMD, TM01 == last_TMD_of_acc
                        # @RJ replace with df_nonTMD_sliced['end_juxta_after_TM01'] = df_nonTMD_sliced['len_query_align_seq'] and use dropna to avoid nans later?
                        df_nonTMD_sliced['end_juxta_after_TM01'] = np.where(utils.isNaN(df_nonTMD_sliced['start_juxta_after_TM01']) == True, np.nan, df_nonTMD_sliced['len_query_align_seq'])

                # the analysis is slow, so don't repeat TM01 if there is only one TM helix in the protein
                if p_is_multipass:
                    if not TMD == "TM01" and not TMD == last_TMD_of_acc:
                        df_nonTMD_sliced = juxta_function_1(df_nonTMD_sliced, TMD)
                        # df_nonTMD_sliced['start_juxta_after_%s'%TMD] = np.where(utils.isNaN(df_nonTMD_sliced['TM%.2d_start_in_SW_alignment'%(int(TMD[2:])+1)])==True,np.nan,df_nonTMD_sliced['%s_end_in_SW_alignment'%TMD])
                        # df_nonTMD_sliced['end_juxta_before_%s'%TMD] = np.where(df_nonTMD_sliced["%s_start_in_SW_alignment"%TMD]!=0,df_nonTMD_sliced["%s_start_in_SW_alignment"%TMD],np.nan)
                        # df_nonTMD_sliced['end_juxta_after_%s'%TMD] = df_nonTMD_sliced["%s_end_in_SW_alignment"%TMD]+((df_nonTMD_sliced["TM%.2d_start_in_SW_alignment"%(int(TMD[2:])+1)]-df_nonTMD_sliced["%s_end_in_SW_alignment"%TMD])/2).apply(lambda x :int(x) if not np.isnan(x) else np.nan)
                        # df_nonTMD_sliced['start_juxta_before_%s'%TMD] = np.where(df_nonTMD_sliced["end_juxta_after_TM%.2d"%(int(TMD[2:])-1)] == df_nonTMD_sliced['end_juxta_before_%s'%TMD] ,df_nonTMD_sliced["end_juxta_after_TM%.2d"%(int(TMD[2:])-1)],df_nonTMD_sliced["end_juxta_after_TM%.2d"%(int(TMD[2:])-1)])
                        # df_nonTMD_sliced['seq_juxta_after_%s_in_query'%TMD] = df_nonTMD_sliced[df_nonTMD_sliced['start_juxta_after_%s'%TMD].notnull()].apply(utils.slice_juxta_after_TMD_in_query, args = (TMD,), axis=1)
                        # df_nonTMD_sliced['seq_juxta_after_%s_in_match'%TMD] = df_nonTMD_sliced[df_nonTMD_sliced['end_juxta_after_%s'%TMD].notnull()].apply(utils.slice_juxta_after_TMD_in_match, args = (TMD,), axis=1)
                    if TMD == last_TMD_of_acc:
                        df_nonTMD_sliced['start_juxta_before_%s' % TMD] = df_nonTMD_sliced['end_juxta_after_%s' % prev_TMD]
                        df_nonTMD_sliced['end_juxta_before_%s' % TMD] = df_nonTMD_sliced['%s_start_in_SW_alignment' % TMD]
                        df_nonTMD_sliced['start_juxta_after_%s' % TMD] = np.where(df_nonTMD_sliced['%s_end_in_SW_alignment' % TMD] == df_nonTMD_sliced['len_query_align_seq'], np.nan, df_nonTMD_sliced['%s_end_in_SW_alignment' % TMD])
                        df_nonTMD_sliced['end_juxta_after_%s' % TMD] = np.where(utils.isNaN(df_nonTMD_sliced['start_juxta_after_%s' % TMD]) == True, np.nan, df_nonTMD_sliced['len_query_align_seq'])
                        # df_nonTMD_sliced['seq_juxta_after_%s_in_query'%TMD] = df_nonTMD_sliced[df_nonTMD_sliced['start_juxta_after_%s'%TMD].notnull()].apply(utils.slice_juxta_after_TMD_in_query, args = (TMD,), axis=1)
                        # df_nonTMD_sliced['seq_juxta_after_%s_in_query'%TMD] = df_nonTMD_sliced.query_align_seq[int(df_nonTMD_sliced['start_juxta_after_TM10']):int(df_nonTMD_sliced['end_juxta_after_TM10'])]
                        # df_nonTMD_sliced['seq_juxta_after_%s_in_match'%TMD] =
                else:
                    # the end_juxta_after_TM01 is already defined, nothing else needs to be done for the single-pass proteins
                    pass

                last_TMD_of_acc = list_of_TMDs[-1]

                index_juxta = df_nonTMD_sliced['start_juxta_before_%s' % TMD].notnull().index
                q = np.array(dfs.loc[index_juxta, "query_align_seq"])
                st = np.array(df_nonTMD_sliced.loc[index_juxta, 'start_juxta_before_%s' % TMD])
                st = st.astype(int)
                en = np.array(df_nonTMD_sliced.loc[index_juxta, 'end_juxta_before_%s' % TMD])
                en = en.astype(int)
                q_sliced = [q[i][st[i]:en[i]] for i in range(len(q))]
                df_nonTMD_sliced['seq_juxta_before_%s_in_query' % TMD] = pd.Series(q_sliced, index=index_juxta)
                m = np.array(dfs.loc[index_juxta, "match_align_seq"])
                m_sliced = [m[i][st[i]:en[i]] for i in range(len(m))]
                df_nonTMD_sliced['seq_juxta_before_%s_in_match' % TMD] = pd.Series(m_sliced, index=index_juxta)
                # df_nonTMD_sliced['seq_juxta_before_%s_in_query' % TMD] = df_nonTMD_sliced[df_nonTMD_sliced['start_juxta_before_%s' % TMD].notnull()].apply(utils.slice_juxta_before_TMD_in_query, args=(TMD,), axis=1)
                # df_nonTMD_sliced['seq_juxta_before_%s_in_match' % TMD] = df_nonTMD_sliced[df_nonTMD_sliced['start_juxta_before_%s' % TMD].notnull()].apply(utils.slice_juxta_before_TMD_in_match, args=(TMD,), axis=1)

                if not TMD == last_TMD_of_acc:
                    index_juxta = df_nonTMD_sliced['end_juxta_after_%s' % TMD].notnull().index
                    st = np.array(df_nonTMD_sliced.loc[index_juxta, 'start_juxta_after_%s' % TMD])
                    st = st.astype(int)
                    en = np.array(df_nonTMD_sliced.loc[index_juxta, 'end_juxta_after_%s' % TMD])
                    en = en.astype(int)
                    q_sliced = [q[i][st[i]:en[i]] for i in range(len(q))]
                    df_nonTMD_sliced['seq_juxta_after_%s_in_query' % TMD] = pd.Series(q_sliced, index=index_juxta)
                    m_sliced = [m[i][st[i]:en[i]] for i in range(len(m))]
                    df_nonTMD_sliced['seq_juxta_after_%s_in_match' % TMD] = pd.Series(m_sliced, index=index_juxta)
                    # df_nonTMD_sliced['seq_juxta_after_%s_in_query' % TMD] = df_nonTMD_sliced[df_nonTMD_sliced['end_juxta_after_%s' % TMD].notnull()].apply(utils.slice_juxta_after_TMD_in_query, args=(TMD,), axis=1)
                    # df_nonTMD_sliced['seq_juxta_after_%s_in_match' % TMD] = df_nonTMD_sliced[df_nonTMD_sliced['end_juxta_after_%s' % TMD].notnull()].apply(utils.slice_juxta_after_TMD_in_match, args=(TMD,), axis=1)
                else:
                    index_juxta = df_nonTMD_sliced['start_juxta_after_%s' % TMD].notnull().index
                    st = np.array(df_nonTMD_sliced.loc[index_juxta, 'start_juxta_after_%s' % TMD])
                    st = st.astype(int)
                    en = np.array(df_nonTMD_sliced.loc[index_juxta, 'end_juxta_after_%s' % TMD])
                    en = en.astype(int)
                    q_sliced = [q[i][st[i]:en[i]] for i in range(len(q))]
                    df_nonTMD_sliced['seq_juxta_after_%s_in_query' % TMD] = pd.Series(q_sliced, index=index_juxta)
                    m_sliced = [m[i][st[i]:en[i]] for i in range(len(m))]
                    df_nonTMD_sliced['seq_juxta_after_%s_in_match' % TMD] = pd.Series(m_sliced, index=index_juxta)
                    # df_nonTMD_sliced['seq_juxta_after_%s_in_query' % TMD] = np.nan
                    # df_nonTMD_sliced['seq_juxta_after_%s_in_match' % TMD] = np.nan
                    # for hit in df_nonTMD_sliced.index:
                    #     if not utils.isNaN(df_nonTMD_sliced['start_juxta_after_%s' % TMD])[hit]:
                    #         # altered to .loc rather than ['seq_juxta_after_%s_in_match'%TMD][hit] after SettingWithCopyWarning
                    #         df_nonTMD_sliced.loc[hit, 'seq_juxta_after_%s_in_match' % TMD] = df_nonTMD_sliced.match_align_seq[hit][int(df_nonTMD_sliced.loc[hit, "start_juxta_after_%s" % TMD]):int(df_nonTMD_sliced.loc[hit, "end_juxta_after_%s" % TMD])]
                    #         df_nonTMD_sliced.loc[hit, 'seq_juxta_after_%s_in_query' % TMD] = df_nonTMD_sliced.query_align_seq[hit][int(df_nonTMD_sliced.loc[hit, "start_juxta_after_%s" % TMD]):int(df_nonTMD_sliced.loc[hit, "end_juxta_after_%s" % TMD])]

        df_nonTMD_temp_pickle = os.path.join(homol_dir, "{}_nonTMD_sliced_df.pickle".format(protein_name))
        with open(df_nonTMD_temp_pickle, "wb") as f:
            pickle.dump(df_nonTMD_sliced, f, protocol=pickle.HIGHEST_PROTOCOL)
        homol_sliced_zip.write(df_nonTMD_temp_pickle, arcname=os.path.basename(df_nonTMD_temp_pickle))
        os.remove(df_nonTMD_temp_pickle)

    return acc, True, "0"
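
# The juxtamembrane slicing above cuts a different substring out of each aligned sequence, using per-row
# integer start/end columns. The sketch below (not part of the original pipeline; the toy column names
# "seq", "start" and "end" are illustrative, not real korbinian columns) shows the same NumPy/list-comprehension pattern.
def _example_slice_per_row():
    """Slices a different substring from each row, mirroring the q_sliced/m_sliced logic above."""
    import numpy as np
    import pandas as pd
    df = pd.DataFrame({"seq": ["MKTAYIAKQR", "GGLLSACDEF"],
                       "start": [2, 0],
                       "end": [5, 4]})
    idx = df["start"].notnull().index
    seqs = np.array(df.loc[idx, "seq"])
    st = np.array(df.loc[idx, "start"]).astype(int)
    en = np.array(df.loc[idx, "end"]).astype(int)
    # python-level loop over the rows, slicing each string with its own indices
    sliced = [seqs[i][st[i]:en[i]] for i in range(len(seqs))]
    df["sliced"] = pd.Series(sliced, index=idx)
    return df  # "sliced" column holds ["TAY", "GGLL"]
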
def parse_SIMAP_to_csv(p):
    """ Parses the SIMAP XML file to csv for a single protein.

    Designed for use in multiprocessing, where logging.info will only print to the console, and the logfile will
    contain the messages in the return statements, telling if that protein was successful.

    Notes
    -----
    - sdict is the dictionary with all the simap header info. It's not actually used anywhere further in the pipeline at the moment.

    Parameters
    ----------
    p : dict
        Protein Dictionary. Contains all input settings, sequences and filepaths related to a single protein.
        Protein-specific data is extracted from one row of the list summary, e.g. List05_summary.csv, which is read as df.
        p also contains the GENERAL korbinian settings and filepaths for that list (pathdict, s, logging)

        Components
        ----------
        pathdict : dict
            Dictionary of the key paths and files associated with that List number.
        s : dict
            Settings dictionary extracted from excel settings file.
        logging : logging.Logger
            Logger for printing to console and/or logfile.
            If multiprocessing == True, logging.info etc will only print to console.
        p : protein-specific dictionary components
            acc, list_of_TMDs, description, TM01_seq, etc

    Saved Files and Figures
    -----------------------
    homol_df_orig_zip : zipfile
        Zipfile containing the following:
            SIMAP_align_pretty_csv : csv
                CSV file containing the hit_number, protein description and the pretty alignment for each homologue
            homol_df_orig_pickle : pickled pd.DataFrame
                Dataframe containing all sequences extracted from the XML file.
                This can be large, as it contains the full query, markup and match sequences

    Returns
    -------
    In all cases, a tuple (str, bool, str) is returned.
    if successful:
        return acc, True, "0"
    if not successful:
        return acc, False, "specific warning or reason why protein failed"
    """
    pathdict, s, logging = p["pathdict"], p["s"], p["logging"]
    acc = p["acc"]
    sys.stdout.write("{}, ".format(acc))
    sys.stdout.flush()
    protein_name = p['protein_name']

    # if overwrite_simap_parsed_to_csv is False, skip proteins where the homol_df_orig_zip file seems good
    if s["overwrite_simap_parsed_to_csv"] == False:
        if os.path.isfile(p['homol_df_orig_zip']):
            try:
                # open up the csv as a dataframe. Delete the zip file if a csv is not found.
                dfh_test = utils.open_df_from_pickle_zip(p['homol_df_orig_zip'], filename=os.path.basename(p['homol_df_orig_pickle']), delete_corrupt=True)
                description_of_first_hit = dfh_test.loc[1, 'description']
                logging.info('Protein %s: homologues already converted to csv. (%s)' % (p["acc"], description_of_first_hit))
                # The file seems fine. Skip to next protein.
                warning = "{} skipped, homologues already parsed to csv".format(p['protein_name'])
                logging.info(warning)
                return acc, False, warning
            except (EOFError, KeyError):
                # file may be corrupted, if script stopped unexpectedly before compression was finished
                logging.info('%s seems to be corrupted. File will be deleted and parsing from xml to csv repeated.' % p['homol_df_orig_zip'])
                os.remove(p['homol_df_orig_zip'])

    # set up counters
    number_of_hits_missing_protein_node = 0
    num_hits_with_SW_align_node = 0
    number_of_hits_missing_smithWatermanAlignment_node = 0

    ft_xml_path = p['SIMAP_feature_table_XML_path']
    homol_xml_path = p['SIMAP_homol_XML_path']
    SIMAP_tar = p['SIMAP_tar']
    homol_xml_filename = os.path.basename(homol_xml_path)

    # check which files exist
    homol_in_tar = utils.check_SIMAP_tarfile(SIMAP_tar, ft_xml_path, homol_xml_path, acc, logging, delete_corrupt=True)[-1]

    # NEW: XML is parsed if only the homol_in_tar (feature tables are not necessary)
    if not homol_in_tar:
        warning = "{} skipped (no homologues)".format(p['protein_name'])
        logging.info(warning)
        return acc, False, warning

    # create subfolders, if they don't exist
    subfolder = os.path.dirname(p['homol_df_orig_zip'])
    utils.make_sure_path_exists(subfolder)

    # extract the tarfile so that it can be read as xml
    tar = tarfile.open(p['SIMAP_tar'], 'r:gz')
    SIMAP_homologues_XML_file_extracted = tar.extractfile(homol_xml_filename)
    try:
        # parse_uniprot the XML file with elementtree, define the 'root' of the XML file
        simap_homologue_tree = ET.parse(SIMAP_homologues_XML_file_extracted)
        simap_homologue_root = simap_homologue_tree.getroot()
    except xml.etree.ElementTree.ParseError:
        # returns a tuple
        message = "{} contains xml file that gives a ParseError. " \
                  "In the future, file may be automatically deleted.".format(p['homol_df_orig_zip'])
        logging.info(message)
        return acc, False, message

    try:
        error = simap_homologue_root[0][0][1][0].text
        if "could not find the query sequence" in error:
            # returns a tuple
            message = "{} not in simap database".format(acc)
            logging.info(message)
            return acc, False, message
    except IndexError:
        # file is probably normal, as it doesn't contain the message saying that the protein is not found in the database
        pass

    # the sdict is the dictionary of info at top of SIMAP XML, before the matches start
    # it will be saved in a separate csv
    sdict = {}

    try:
        sdict['SIMAP_created'] = simap_homologue_root[0][0][0][0][2][1][0].attrib["created"]
        for parameters in simap_homologue_root[0][0][0][0].iter('parameters'):
            sdict['SIMAP_input_seq_details_dict'] = str(parameters[0][0].attrib)
            for SIMAP_filter in parameters.iter('filter'):
                SIMAP_filter_string = SIMAP_filter.text
            sdict['SIMAP_filter_string'] = str(SIMAP_filter_string)
            for resultSpecification in parameters.iter('resultSpecification'):
                SIMAP_resultSpecification_dict = resultSpecification.attrib
            sdict['SIMAP_resultSpecification_dict'] = '"%s"' % SIMAP_resultSpecification_dict
            for databases in parameters.iter('databases'):
                database_details_dict = databases[0].attrib
            sdict['database_details_dict'] = '"%s"' % database_details_dict
        sdict['simap_version'] = simap_homologue_root[0][0][0][0][0].attrib['version']
        sdict['SIMAP_total_hits'] = int(simap_homologue_root[0][0][0][1][0].attrib['total'])
        if sdict['simap_version'] != '4.0':
            logging.warning('WARNING! Your XML file is simap version %s, '
                            'however this SIMAP parser was developed for SIMAP version 4.0.' % sdict['simap_version'])
        query_sequence_node = simap_homologue_root[0][0][0][0][2][0][0]
        ''' xxxx CURRENTLY THE df is filled with nan values, but that doesn't make sense as the script seems to work '''
        sdict['query_md5'] = query_sequence_node.attrib['md5']
        sdict['seqlen'] = int(query_sequence_node.attrib['length'])
        sdict['query_selfscore'] = query_sequence_node.attrib['selfscore']
        sdict['query_sequenceid'] = query_sequence_node.attrib['sequenceid']
        sdict['total_number_of_simap_hits'] = query_sequence_node[0].attrib['number_hits']
        sdict['query_sequence_from_homologue_XML_file'] = query_sequence_node[0][0].text
        sdict['number_of_hits_in_homologue_XML_file'] = int(simap_homologue_root[0][0][0][1][0].attrib['total'])
    except (IndexError, KeyError):
        warning = "{} skipped, homologue XML seems to be damaged. Error in reading general query details.".format(protein_name)
        logging.warning(warning)
        # skip to the next protein
        return acc, False, warning

    if p['full_seq'].upper() != sdict['query_sequence_from_homologue_XML_file'].upper():
        logging.warning("...............................\n"
                        "{} WARNING: Mismatch between full_seq and SIMAP seq from XML file. Tarball with SIMAP XML is probably old and should be deleted.\n"
                        "full_seq : {}\n"
                        "XML_seq  : {}\n"
                        "Tarball  : {}\n"
                        "acc has been added to mismatch_full_seq_with_simap_txt\n"
                        "...............................\n".format(acc, p['full_seq'].upper(), sdict['query_sequence_from_homologue_XML_file'].upper(), p['SIMAP_tar']))
        # add accession number to the list of acc with a sequence mismatch
        mismatch_full_seq_with_simap_list = utils.get_acc_list_from_txt(pathdict["mismatch_full_seq_with_simap_txt"])
        if acc not in mismatch_full_seq_with_simap_list:
            with open(pathdict["mismatch_full_seq_with_simap_txt"], "a") as source:
                source.write("\n{}".format(acc))

    # for each hit, save all the relevant data in the form of a dictionary,
    # so it can be added to a csv file or used in other calculations
    simap_homologue_hits = simap_homologue_root[0][0][0][1][0]

    # see if there are any hits at all
    try:
        test2 = simap_homologue_root[0][0][0][1][0][0]
    except IndexError:
        warning = "{} skipped, homologue XML has no hits.".format(protein_name)
        logging.warning(warning)
        # skip to the next protein
        return acc, False, warning

    """OLD AMINO ACID SUBSTITUTION CODE. THIS IS SLOW, AND GIVES NO SIGNIFICANT DIFFERENCE TO
    AAIMON OR AASMON WITH THE SIMAP SMITH-WATERMAN MATRIX"""
    # load the amino acid substitution matrices from the settings file
    # list_of_aa_sub_matrices = s['aa_sub_matrices']
    # import the amino acid substitution matrices
    # utils.import_amino_acid_substitution_matrices()
    # add the similarity ratios to the csv_header_for_SIMAP_homologue_file.
    # These will depend on the individual settings
    # if s["mp_calculate_TMD_conservation_with_aa_matrices"]:
    #     for j in range(s["gap_open_penalty_min"], s["gap_open_penalty_max"], s["gap_open_penalty_increment"]):
    #         gap_open_penalty = j
    #         gap_extension_penalty = j
    #         for matrix_name in list_of_aa_sub_matrices:
    #             column_name = 'sim_ratio_%s_gapo%i' % (matrix_name.replace("'", "")[0:-7], j)
    #             csv_header_for_SIMAP_homologue_file.append(column_name)
    # import the necessary matrices
    # for matrix_name in list_of_aa_sub_matrices:
    #     matrix = matrix_name[0:-7]
    #     from Bio.SubsMat.MatrixInfo import matrix as matrix_name

    SIMAP_orig_csv = p['homol_df_orig_zip'][:-4] + ".csv"
    # fasta_file_path = p['fasta_file_path']

    # create an empty file
    open(SIMAP_orig_csv, 'w').close()
    # reopen to add match details iteratively from dictionary
    csvfile = open(SIMAP_orig_csv, 'a')

    # set up a bool to catch those files where not a single hit actually gives data
    at_least_one_hit_contains_SW_node = False

    for hit in simap_homologue_hits:
        match_details_dict = {}
        # add desired hit information to the dictionary for transfer to csv
        hit_num = int(hit.attrib['number'])
        match_details_dict['hit_num'] = hit_num
        match_details_dict['md5'] = hit[1].attrib['md5']

        # define the major nodes in the XML-file
        try:
            protein_node = hit[1][1]
            hit_contains_protein_node = True
        except IndexError:
            hit_contains_protein_node = False
            number_of_hits_missing_protein_node += 1
            logging.warning('%s hit %s contains no protein node' % (protein_name, match_details_dict['md5']))
        if not hit_contains_protein_node:
            # skip to next hit
            continue

        try:
            smithWatermanAlignment_node = hit[0][0][14]
            hit_contains_SW_node = True
            num_hits_with_SW_align_node += 1
        except IndexError:
            hit_contains_SW_node = False
        match_details_dict['hit_contains_SW_node'] = hit_contains_SW_node

        # add the description. Add a custom name if it is the first (query) hit
        if hit_num == 1:
            description = '%s_SIMAP_query_sequence' % protein_name
        else:
            description = protein_node.attrib['description']
        match_details_dict['description'] = description

        try:
            databaseId = int(protein_node[1].attrib['databaseId'])
            match_details_dict['databaseId'] = int(protein_node[1].attrib['databaseId'])
        except KeyError:
            databaseId = 0
            # match_details_dict['databaseId'] = int(0)
        # databaseId = int(protein_node[1].attrib['databaseId'])
        databasenode = protein_node[1]
        match_details_dict['database'] = databasenode.attrib['name']

        try:
            taxonomyNode = protein_node[2]
            match_details_dict['organism'] = taxonomyNode.attrib['name']
            match_details_dict['taxonomy_node_id'] = taxonomyNode.attrib['node_id']
            match_details_dict['taxonomy_rank'] = taxonomyNode.attrib['rank']
        except IndexError:
            # sequence is from an unknown organism, as it has no database node
            match_details_dict['description'] += ', no_database_node'
            match_details_dict['organism'] = 'no_database_node'
            match_details_dict['taxonomy_node_id'] = 'no_database_node'
            match_details_dict['taxonomy_rank'] = 'no_database_node'

        match_details_dict['len_full_match_seq'] = len(hit[1][0][0].text)
        # len_full_match_seq = len(full_match_seq)

        alignment_node = hit[0][0]
        # E-value for hit
        match_details_dict['FASTA_expectation'] = float(alignment_node[1].text)
        # convert identity from e.g. 80 (80%) to 0.8
        match_details_dict['FASTA_identity'] = float(alignment_node[3].text) / 100
        # strangely, I think gappedIdentity is the identity EXCLUDING gaps, which is a better value to base judgements on.
        # convert identity from e.g. 80 (80%) to 0.8
        match_details_dict['FASTA_gapped_identity'] = float(alignment_node[4].text) / 100
        # creating the real observed changes from FASTA_gapped_identity - this is a percentage value now!!!
        match_details_dict['obs_changes'] = 100 - float(alignment_node[4].text)
        '''xxx notes on the gapped identity
        N.B. The FASTA_gapped_identity data here is from the FASTA algorithm, that precedes the SW algorithm.
        Occasionally they don't match!!!
        I calculate the TMD identity manually from the SW alignment, BUT
        currently for the calculation of membranous/nonmembranous I use the gappedIdentity from the FASTA output
        (the SW output only has identity including gaps)
         - if I counted the gaps from the SW alignment, I COULD recalculate the gappedIdentity for the SW alignment
         - OR: I could simply remove the data where the FASTA and SW don't match.
        '''
        # FASTA overlap should be the length of the aligned region after running the FASTA algorithm (alignment is not shown by SIMAP)
        match_details_dict['FASTA_overlap'] = int(alignment_node[5].text)
        match_details_dict['FASTA_query_coverage'] = float(alignment_node[11].text)
        match_details_dict['FASTA_match_coverage'] = float(alignment_node[12].text)
        # find the start and the stop of the hsp
        querySeq = alignment_node[6]
        match_details_dict['FASTA_query_start'] = int(querySeq.attrib['start'])
        match_details_dict['FASTA_query_end'] = int(querySeq.attrib['end'])
        matchSeq = alignment_node[7]
        match_details_dict['FASTA_match_start'] = int(matchSeq.attrib['start'])
        match_details_dict['FASTA_match_end'] = int(matchSeq.attrib['end'])

        """OLD CALCULATIONS THAT ARE NOW CONVERTED TO PANDAS ARRAY-WISE FUNCTIONS"""
        # some parameters that are needed for identity calculations later
        # FASTA_num_ident_res = FASTA_identity / 100.0 * FASTA_overlap
        # is_start_of_TMD_in_FASTA = True if FASTA_query_start <= TMDstart else False
        # is_end_of_TMD_in_FASTA = True if TMDend <= FASTA_query_end else False
        # is_TMD_in_FASTA_alignment = True if all([is_start_of_TMD_in_FASTA, is_end_of_TMD_in_FASTA]) else False

        '''***********************if the TMD region is actually covered by the hsp, then conduct some further analyses of the match TMD region*************************'''
        if hit_contains_SW_node:
            query_align_seq = ''
            '''For the moment, there is no need to put the whole match hsp sequence into the csv file'''
            # for smithWatermanAlignment in alignment_node.iter('smithWatermanAlignment'):
            match_details_dict['SW_query_score_ratio'] = smithWatermanAlignment_node[0].text
            match_details_dict['SW_match_score_ratio'] = smithWatermanAlignment_node[1].text
            match_details_dict['SW_query_coverage'] = smithWatermanAlignment_node[2].text
            match_details_dict['SW_match_coverage'] = smithWatermanAlignment_node[3].text
            match_details_dict['SW_coverage_ratio'] = smithWatermanAlignment_node[4].text
            match_details_dict['align_pretty'] = smithWatermanAlignment_node[8].text
            match_details_dict['SW_alignment_seq1offset'] = int(smithWatermanAlignment_node.attrib['alignment-seq1offset'])
            match_details_dict['SW_alignment_seq2offset'] = int(smithWatermanAlignment_node.attrib['alignment-seq2offset'])
            match_details_dict['SW_identity'] = float(smithWatermanAlignment_node.attrib['identity'])
            match_details_dict['SW_similarity'] = float(smithWatermanAlignment_node.attrib['similarity'])
            # Get the full sequences. Note that they greatly increase the size of the csv file.
            match_details_dict['query_align_seq'] = smithWatermanAlignment_node[5].text
            match_details_dict['align_markup_seq'] = smithWatermanAlignment_node[6].text
            match_details_dict['match_align_seq'] = smithWatermanAlignment_node[7].text
        else:
            number_of_hits_missing_smithWatermanAlignment_node += 1

        if hit_num == 1:
            # sort
            csv_header_for_SIMAP_homologue_file = sorted(list(match_details_dict.keys()))
            # save the csv header to the csv file
            writer = csv.writer(csvfile, delimiter=',', quotechar='"', lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC, doublequote=True)
            writer.writerow(csv_header_for_SIMAP_homologue_file)
        # save the match_details_dict as a line in the csv file
        writer = csv.DictWriter(csvfile, fieldnames=csv_header_for_SIMAP_homologue_file,
                                extrasaction='ignore', delimiter=',', quotechar='"',
                                lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC, doublequote=True)
        writer.writerow(match_details_dict)

    # close tar and csv
    csvfile.close()
    tar.close()

    # open csv as a dataframe,
    df_homol = pd.read_csv(SIMAP_orig_csv, sep=",", quoting=csv.QUOTE_NONNUMERIC, index_col="hit_num")

    if "query_align_seq" not in df_homol.columns:
        # this is a serious error in the XML file. None of the hits had a protein node. The file should probably be downloaded.
        warning = 'The homologue XML file likely has a serious error, "query_align_seq" is not in dataframe. ' \
                  'XML should probably be re-downloaded.\n' \
                  'df_homol["hit_contains_SW_node"].value_counts()\n{}'.format(df_homol["hit_contains_SW_node"].value_counts())
        logging.warning(warning)
        # skip this protein
        return acc, False, warning

    # get length of seq. Previously this was a lambda function that needed more filtering
    df_homol['len_query_align_seq'] = df_homol['query_align_seq'].str.len()

    # conduct the text searching for disallowed words
    words_not_allowed_in_description = ast.literal_eval(s["words_not_allowed_in_description"])
    # collect disallowed words in hit protein description (patent, synthetic, etc)
    df_homol['list_disallowed_words_in_descr'] = df_homol['description'].dropna().apply(utils.find_disallowed_words, args=(words_not_allowed_in_description,))
    # create a boolean column to select hits that do not contain these words in the description
    df_homol['disallowed_words_not_in_descr'] = df_homol['list_disallowed_words_in_descr'] == '[]'

    # check if there are non-IUPAC amino acids in the sequence (frequently large gaps from NG sequencing data)
    df_homol['X_in_match_seq'] = df_homol['match_align_seq'].str.contains("X")

    # restrict to just a few columns including the align_pretty that might be useful to check manually
    df_pretty = df_homol[["FASTA_gapped_identity", "obs_changes", "organism", "description", "align_pretty"]]
    # save the align_pretty to csv
    df_pretty.to_csv(p['SIMAP_align_pretty_csv'], sep=',', quoting=csv.QUOTE_NONNUMERIC)
    # drop the align_pretty column from the orig dataframe
    df_homol.drop('align_pretty', axis=1, inplace=True)

    # save the whole dataframe as a pickle for faster opening later
    with open(p['homol_df_orig_pickle'], "wb") as pick:
        pickle.dump(df_homol, pick, protocol=pickle.HIGHEST_PROTOCOL)

    simap_header_info_ser = pd.Series(sdict)
    simap_header_info_ser.to_csv(p['simap_header_info_csv'])

    # either create new zip and add ("w"), or open existing zip and add "a"
    with zipfile.ZipFile(p['homol_df_orig_zip'], mode="w", compression=zipfile.ZIP_DEFLATED) as zipout:
        # zipout.write(SIMAP_orig_csv, arcname=os.path.basename(SIMAP_orig_csv))
        zipout.write(p['SIMAP_align_pretty_csv'], arcname=os.path.basename(p['SIMAP_align_pretty_csv']))
        zipout.write(p['homol_df_orig_pickle'], arcname=os.path.basename(p['homol_df_orig_pickle']))
        zipout.write(p['simap_header_info_csv'], arcname=os.path.basename(p['simap_header_info_csv']))

    # delete temporary uncompressed files
    os.remove(SIMAP_orig_csv)
    os.remove(p['SIMAP_align_pretty_csv'])
    os.remove(p['homol_df_orig_pickle'])
    os.remove(p['simap_header_info_csv'])

    return acc, True, "0"
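
# parse_SIMAP_to_csv writes its per-protein outputs as temporary files and then bundles them into a single zip.
# Below is a minimal sketch of that pickle-into-zip round trip (not part of the original pipeline;
# the file names and the single example column are hypothetical).
def _example_pickle_df_into_zip():
    """Pickles a DataFrame, stores it inside a zip, removes the temp file, then reads it back out."""
    import os
    import pickle
    import tempfile
    import zipfile
    import pandas as pd
    tmpdir = tempfile.mkdtemp()
    pickle_path = os.path.join(tmpdir, "example_df.pickle")
    zip_path = os.path.join(tmpdir, "example_df.zip")
    df = pd.DataFrame({"FASTA_gapped_identity": [0.95, 0.80]}, index=[1, 2])
    with open(pickle_path, "wb") as f:
        pickle.dump(df, f, protocol=pickle.HIGHEST_PROTOCOL)
    with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as zipout:
        zipout.write(pickle_path, arcname=os.path.basename(pickle_path))
    # the uncompressed copy is no longer needed once it is inside the zip
    os.remove(pickle_path)
    with zipfile.ZipFile(zip_path) as zipin, zipin.open("example_df.pickle") as f:
        df_reloaded = pickle.load(f)
    return df_reloaded
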
def get_omp_TM_indices_and_slice_from_summary_table(OMPdb_list_csv, list_parsed_csv, OMPdb_topology_reliability_cutoff, logging, s):
    """ Takes a csv parsed from OMPdb, gets the TM indices and slices the TMDs for each protein.

    Parameters
    ----------
    OMPdb_list_csv : str
        Path to input csv with OMP sequences and membrane annotation.
    list_parsed_csv : str
        Path to output csv with the sliced TM sequences.
    OMPdb_topology_reliability_cutoff : float
        Proteins with a topology reliability at or below this cutoff are dropped.
    logging : logging.Logger
        Logger for printing to console and logfile.
    s : dict
        Settings dictionary extracted from excel settings file.
    """
    logging.info('~~~~~~~~~starting get_omp_TM_indices_and_slice_from_summary_table~~~~~~~~~')
    df_KW = pd.read_csv(OMPdb_list_csv, sep=",", quoting=csv.QUOTE_NONNUMERIC, index_col=0)

    # check if signal peptides should be extracted, modify keywords dict
    analyse_SiPe = False
    if 'SiPe' in s['regions']:
        analyse_SiPe = True

    # get sequence length
    df_KW["seqlen"] = df_KW["Sequence"].str.len()

    # Creating new column M_indices, which contains the indices of Ms
    df_KW["M_indices"] = df_KW.Topology.apply(getting_membrane_indices)

    # Converting empty entries to NaN
    df_KW["M_indices"] = df_KW.M_indices.apply(lambda x: np.nan if x == [] else x)

    num_proteins_BEFORE_dropping_those_without_mem_indices = df_KW.shape[0]

    # Extracting entries to a new Dataframe
    df_KW = df_KW[df_KW.M_indices.notnull()]

    num_proteins_AFTER_dropping_those_without_mem_indices = df_KW.shape[0]

    # Filter, cutting off coverages under 85% & creating new index
    df_KW = df_KW.loc[df_KW["Coverage(%)"] >= 85]
    # df_KW.index = range(1, len(df_KW["Uniprot"]) + 1)

    num_proteins_AFTER_dropping_those_with_coverage_below_85 = df_KW.shape[0]

    # Creating new list (nested list)
    nested_list_of_membrane_borders = []

    # Filling nest with lists of start and end-points
    for n in df_KW.M_indices:
        m_borders = []
        m_borders.append(n[0])
        m_borders = check_for_border(n, m_borders)
        m_borders.append(n[-1])
        nested_list_of_membrane_borders.append(m_borders)

    array_membrane_borders = np.array(nested_list_of_membrane_borders)
    array_membrane_borders_corrected = []
    for subarray in array_membrane_borders:
        # logging.info(subarray[::2] = subarray[::2]*10)
        subarray = np.array(subarray)
        subarray[1::2] = subarray[1::2] + 1
        array_membrane_borders_corrected.append(list(subarray))

    nested_list_of_membrane_borders_python_indexstyle = array_membrane_borders_corrected

    # Creating new column, which contains start and end-points
    df_KW["Membrane_Borders"] = nested_list_of_membrane_borders_python_indexstyle

    # Creating new column, which contains the number of TMDs
    df_KW["number_of_TMDs"] = df_KW.Membrane_Borders.apply(lambda x: len(x) / 2)

    # Filter, filters out, if less than 8 or more than 24 TMDs
    # REMOVED. FILTERING BY NUMBER OF TMDS IS NOW DONE LATER, in PROT_LIST
    # df_KW["number_of_TMDs"] = df_KW["number_of_TMDs"].apply(lambda x: int(x) if 5 <= x <= 36 else np.nan)

    # Creating new dataframe without nan
    df_KW = df_KW[df_KW["number_of_TMDs"].notnull()]

    num_proteins_AFTER_dropping_those_without_TMDs = df_KW.shape[0]

    df_KW = df_KW[df_KW["Topology_Reli"] > OMPdb_topology_reliability_cutoff]

    num_proteins_AFTER_dropping_those_with_topology_reliability_below_cutoff = df_KW.shape[0]

    df_KW["TM_indices"] = df_KW["Membrane_Borders"].apply(lambda x: tuple(zip(x[::2], x[1::2])))

    # create a list of [TM01, TM02, TM03, etc.]
    long_list_of_TMDs = []
    for i in range(1, 50):
        long_list_of_TMDs.append("TM{:02d}".format(i))

    # for the .set_value function, set dtype as object
    df_KW["list_of_TMDs"] = ""
    df_KW["list_of_TMDs"] = df_KW["list_of_TMDs"].astype(object)

    sys.stdout.write('slicing TMD and nonTMD sequences:\n')

    for row_nr, row in enumerate(df_KW.index):
        # get nested tuple of TMDs
        nested_tup_TMs = df_KW.loc[row, "TM_indices"]
        # slice long list of TMD names to get an appropriate list for that protein [TM01, TM02, TM03, etc.]
        len_nested_tup_TMs = len(nested_tup_TMs)
        list_of_TMDs = long_list_of_TMDs[:len_nested_tup_TMs]
        # add that list to the dataframe (could also be added as a stringlist, but that's irritating somehow)
        df_KW.set_value(row, "list_of_TMDs", list_of_TMDs)
        # set seq for slicing
        full_seq = df_KW.loc[row, "Sequence"]
        # topology = df_KW.loc[row, "Topology"]
        # iterate through all the TMDs of that protein, slicing out the sequences
        for i in range(len(list_of_TMDs)):
            TMD = list_of_TMDs[i]
            tup = nested_tup_TMs[i]
            df_KW.loc[row, TMD + "_start"] = tup[0]
            df_KW.loc[row, TMD + "_end"] = tup[1]
            df_KW.loc[row, TMD + "_seq"] = utils.slice_with_listlike(full_seq, tup)
            # df_KW.loc[row, TMD + "_top"] = utils.slice_with_listlike(topology, tup)
        if row_nr % 50 == 0:
            sys.stdout.write(". ")
            sys.stdout.flush()
            if row_nr % 500 == 0:
                sys.stdout.write("\n")
                sys.stdout.flush()

        ''' ~~ SLICE nonTMD sequence ~~ '''
        # list_of_TMDs = df_KW.loc[row, 'list_of_TMDs'].copy()
        if 'SP01' in list_of_TMDs:
            list_of_TMDs.remove('SP01')
        # sequence from N-term. to first TMD
        nonTMD_first = df_KW.loc[row, 'Sequence'][0:(df_KW.loc[row, 'TM01_start'] - 1).astype('int64')]
        sequence = nonTMD_first
        # only for multipass proteins, generate sequences between TMDs
        if len(list_of_TMDs) == 0:
            # no TMDs are annotated, skip to next protein
            continue
        elif len(list_of_TMDs) > 1:
            for TM_Nr in range(len(list_of_TMDs) - 1):
                # the TMD is the equivalent item in the list
                TMD = list_of_TMDs[TM_Nr]
                # the next TMD, which contains the end index, is the next item in the list
                next_TMD = list_of_TMDs[TM_Nr + 1]
                between_TM_and_TMplus1 = df_KW.loc[row, 'Sequence'][df_KW.loc[row, '%s_end' % TMD].astype('int64'):df_KW.loc[row, '%s_start' % next_TMD].astype('int64') - 1]
                sequence += between_TM_and_TMplus1
        last_TMD = list_of_TMDs[-1]
        # sequence from last TMD to C-term.
        nonTMD_last = df_KW.loc[row, 'Sequence'][df_KW.loc[row, '%s_end' % last_TMD].astype('int64'):df_KW.loc[row, 'seqlen']]
        sequence += nonTMD_last
        df_KW.loc[row, 'nonTMD_seq'] = sequence
        df_KW.loc[row, 'len_nonTMD'] = len(sequence)

        if analyse_SiPe == True:
            if pd.notnull(df_KW.loc[row, 'SP01_start']):
                list_of_TMDs.append('SP01')
                df_KW.set_value(row, "list_of_TMDs", list_of_TMDs)

    ########################################################################################
    #                                                                                      #
    #              slicing out TMD_seq_plus_surr shifted to prot_list.py                   #
    #                                                                                      #
    ########################################################################################
    # max_num_TMDs = df_KW["number_of_TMDs"].max()
    #
    # # n_aa_before_tmd = s["n_aa_before_tmd"]
    # # n_aa_after_tmd = s["n_aa_after_tmd"]
    # n_aa_before_tmd = 10
    # n_aa_after_tmd = 10
    #
    # # currently the loop is run for each TMD, based on the sequence with the most TMDs
    # for i in range(1, int(max_num_TMDs) + 1):
    #     TMD = 'TM%02d' % i
    #     # get the indices for TMD plus surrounding sequence
    #     df_KW = korbinian.prot_list.prot_list.get_indices_TMD_plus_surr_for_summary_file(df_KW, TMD, n_aa_before_tmd, n_aa_after_tmd)
    #     # slice out the TMD_seq_plus_surr for each TMD
    #     df_KW['%s_seq_plus_surr' % TMD] = df_KW[df_KW['%s_start' % TMD].notnull()].apply(utils.slice_uniprot_TMD_plus_surr_seq, args=(TMD,), axis=1)

    # rename columns to match protein lists from uniprot (Note that Family is currently translated as prot_descr)
    dict_ = {"Sequence": "full_seq", "Organism": "organism", "Uniprot": "uniprot_acc", "Gene_Name": "gene_name",
             "Topology_Reli": "topology_reliability", "Family": "prot_descr"}
    df_KW["betabarrel"] = True
    df_KW["multipass"] = True
    df_KW["singlepass"] = False
    # since all beta-barrel proteins have the N-terminus in the periplasm, "N-term is Extracellular" is False
    # you could make 100% sure of this by checking that the first letter of "Topology" is "I", but it is not really necessary
    df_KW["n_term_ec"] = False
    df_KW.rename(columns=dict_, inplace=True)
    df_KW["acc"] = df_KW["uniprot_acc"]
    df_KW["protein_name"] = df_KW["uniprot_acc"]

    num_proteins_AFTER_get_omp_TM_indices_and_slice_from_summary_table = df_KW.shape[0]

    # save to csv (presumably in summaries folder as a list number, so it is accessible by the rest of the scripts)
    utils.make_sure_path_exists(list_parsed_csv, isfile=True)
    df_KW.to_csv(list_parsed_csv, sep=",", quoting=csv.QUOTE_NONNUMERIC)

    logging.info("\nnum_proteins_BEFORE_dropping_those_without_mem_indices : {}".format(num_proteins_BEFORE_dropping_those_without_mem_indices))
    logging.info("num_proteins_AFTER_dropping_those_without_mem_indices : {}".format(num_proteins_AFTER_dropping_those_without_mem_indices))
    logging.info("num_proteins_AFTER_dropping_those_with_coverage_below_85 : {}".format(num_proteins_AFTER_dropping_those_with_coverage_below_85))
    logging.info("num_proteins_AFTER_dropping_those_without_TMDs : {}".format(num_proteins_AFTER_dropping_those_without_TMDs))
    logging.info("num_proteins_AFTER_dropping_those_with_topology_reliability_below_cutoff : {}".format(num_proteins_AFTER_dropping_those_with_topology_reliability_below_cutoff))
    logging.info("num_proteins_AFTER_get_omp_TM_indices_and_slice_from_summary_table : {}".format(num_proteins_AFTER_get_omp_TM_indices_and_slice_from_summary_table))
    logging.info('~~~~~~~~~finished get_omp_TM_indices_and_slice_from_summary_table~~~~~~~~~')
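
# get_omp_TM_indices_and_slice_from_summary_table turns the per-residue topology string into membrane-border
# pairs and then slices the TM sequences. The self-contained sketch below reproduces that idea without the
# korbinian helpers (getting_membrane_indices, check_for_border, utils.slice_with_listlike), which live elsewhere
# in the package; the simplified border logic here is illustrative and may not match them exactly.
def _example_topology_to_TM_slices():
    """Derives (start, end) pairs for membrane runs in a topology string and slices the sequence."""
    seq = "MKTAYIAKQRLLSACDEFGH"
    top = "IIIMMMMMOOOOMMMMMIII"  # I = inside, M = membrane, O = outside
    # indices of membrane residues
    m_indices = [i for i, char in enumerate(top) if char == "M"]
    # borders: start of each contiguous run of Ms, and end index + 1 (python slice style)
    borders = []
    for i, idx in enumerate(m_indices):
        if i == 0 or m_indices[i - 1] != idx - 1:
            borders.append(idx)          # run start
        if i == len(m_indices) - 1 or m_indices[i + 1] != idx + 1:
            borders.append(idx + 1)      # run end, +1 for slicing
    # pair the borders up, exactly as done with zip(x[::2], x[1::2]) above
    tm_indices = tuple(zip(borders[::2], borders[1::2]))
    tm_seqs = [seq[start:end] for start, end in tm_indices]
    return tm_indices, tm_seqs  # ((3, 8), (12, 17)) and ["AYIAK", "SACDE"]
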
def parse_OMPdb_all_selected_to_csv(ListXX_OMPdb_nr_acc, ListXX_OMPdb_redundant_flatfile, OMPdb_list_csv, logging, s):
    """ Extracts ID, seq and topology data from the full OMPdb flatfile, saves to csv.

    Note that instead of parsing line-by-line and saving to a csv, this method stores every single value in a huge
    dictionary, which for 3 proteins looks like this:

        BB_SiPe          [True, True, False]
        Coverage(%)      [99.82, 96.61, 92.19]
        Description      [Pilin outer membrane usher protein SafC, Oute...
        NCBI_TaxID       [59201, 470, 59201]
        Organism         [Salmonella enterica I, Acinetobacter baumanni...
        SP01_start       [1, 1, np.nan]
        Sequence         [MKFKQPALLLFIAGVVHCANAHTYTFDASMLGDAAKGVDMSLFNQ...
        Topology         [IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
        Topology_Reli    [84.13, 94.00, 93.07]
        Uniprot          [A0A0F7J6A4, A0A090B0L0, A0A0F7J6D5]
        seqlen           [836, 356, 721]

    The size of this dictionary with >7000 entries may cause memory problems on a regular PC.
    Currently this method is functional, though, and is not a priority to be fixed.

    Parameters
    ----------
    ListXX_OMPdb_nr_acc : str
        Path to OMPdb list of non-redundant IDs, textfile.
    ListXX_OMPdb_redundant_flatfile : str
        Path to full OMPdb flatfile, all proteins, unzipped.
    OMPdb_list_csv : str
        Path to output csv file.

    Saved Files
    -----------
    OMPdb_list_csv : csv
        csv file derived from dfKW
        contains a row for each protein
        contains indices for TM regions

    Notes
    -----
    This script parses the original text file, rather than XML. Version on server was not working due to " ' error.
    The script works, but goes through an entire flatfile, so is unbelievably slow. Use at your own risk.
    """
    # Creating dictionary keywords
    keywords = {"Uniprot": [], "Family": [], "Gene_Name": [], "Organism": [], "NCBI_TaxID": [], "Coverage(%)": [],
                "Sequence": [], "Topology_Reli": [], "Topology": [], "Description": [], "Pfam_ID": [], "BB_SiPe": [],
                "seqlen": [], "fragment": []}

    # check if signal peptides should be added to the list_of_TMDs and analysed
    # signal peptides will still be detected, via "True" in BB_SiPe. This is useful for excluding potential TM01 mis-labelled as SP.
    analyse_SiPe = False
    if 'SiPe' in s['regions']:
        analyse_SiPe = True
        keywords.update({"SP01_start": [], "SP01_end": [], "SP01_seq": [], "SiPe_source": []})
    logging.info('analyse_SiPe: {}'.format(analyse_SiPe))

    # Start settings which are changed during the for loop
    take_next_seq = False
    take_next_topo = False
    take_ID = False

    # Empty lists which are filled during the for loop
    Raw_Sequences = []
    Raw_Topos = []
    ID_list = []

    # Extracts IDs out of file
    with open(ListXX_OMPdb_nr_acc) as source:
        for line in source:
            line = line.strip()
            ID_list.append(line)

    # Checking ListXX_OMPdb_redundant_flatfile (complete OMPdb in very unfriendly formatting) for IDs
    # (which are stored in the list of potential IDs) and extracting information
    with open(ListXX_OMPdb_redundant_flatfile) as data_file:
        counter = 0
        db_cross_ref = {}
        save_db_cross_ref = False
        for line in data_file:
            line_list = line.strip().split(" ")
            # Further settings which are changed every loop
            sequence_header = False
            topo_header = False
            # If-conditions make sure that the ID is in the list of potential IDs and check for keywords in each line
            if "UNIPROT" in line_list and line_list[-1] in ID_list:
                keywords["Uniprot"].append(line_list[-1])
                take_ID = True
                counter += 1
                if counter % 100 == 0:
                    sys.stdout.write(". ")
                    sys.stdout.flush()
            if "FAMILY" in line_list and take_ID == True:
                keywords["Family"].append(" ".join(line_list[9:]))
            if "DESCRIPTION" in line_list and take_ID == True:
                keywords["Description"].append(" ".join(line_list[4:]))
            if "GENE_NAME" in line_list and take_ID == True:
                keywords["Gene_Name"].append(" ".join(line_list[6:]))
            if "ORGANISM" in line_list and take_ID == True:
                keywords["Organism"].append(" ".join(line_list[7:]))
            if "NCBI_TAXID" in line_list and take_ID == True:
                keywords["NCBI_TaxID"].append(line_list[-1])
            if "DB_REF" in line_list and take_ID == True:
                # add database cross references to special dict
                db_cross_ref.update({line_list[9][:-1]: line_list[10].split('|')})
            if "SIGNAL_PEPTIDE" in line_list and take_ID == True:
                if ' '.join(line_list[1:]) == 'No information available':
                    keywords["BB_SiPe"].append(False)
                    if analyse_SiPe == True:
                        keywords["SP01_start"].append(np.nan)
                        keywords["SP01_end"].append(np.nan)
                        keywords["SP01_seq"].append(np.nan)
                        keywords["SiPe_source"].append('No information available')
                else:
                    # assume there is a signal peptide that starts at 1 (not very optimum code!!!)
                    keywords["BB_SiPe"].append(True)
                    if analyse_SiPe == True:
                        keywords["SP01_start"].append(line_list[1][0])
                        keywords["SP01_end"].append(line_list[1][2:-1])
                        keywords["SP01_seq"].append(line_list[2][:-1])
                        keywords["SiPe_source"].append(' '.join(line_list[-2:]))
            if "COVERAGE(%)" in line_list and take_ID == True:
                keywords["Coverage(%)"].append(line_list[-1])
            if "SEQUENCE" in line_list and take_ID == True:
                # after the "SEQUENCE" statement in a line_list, all db cross references are collected and can be saved
                save_db_cross_ref = True
                keywords["seqlen"].append(line_list[7])
                take_next_seq = True
                sequence_header = True
                # some of the OMPdb entries are labeled as fragments. These should be removed.
                if "Fragment:" in line:
                    searchstring = ".*Fragment\:([NC])"
                    match = re.match(searchstring, line)
                    if match:
                        N_or_C = match.group(1)
                    else:
                        N_or_C = "undefined"
                    keywords["fragment"].append("{}-term".format(N_or_C))
                else:
                    keywords["fragment"].append("undefined")
            # add db cross references from previous protein to keywords dict
            if save_db_cross_ref == True:
                if "Pfam" in db_cross_ref.keys():
                    keywords["Pfam_ID"].append(db_cross_ref["Pfam"])
                else:
                    keywords["Pfam_ID"].append(np.nan)
                # reset db_cross_ref for next cycle
                save_db_cross_ref = False
                db_cross_ref = {}
            if "TOPOLOGY" in line_list and take_ID == True:
                Raw_Sequences.extend(";")
                keywords["Topology_Reli"].append(line_list[-1].strip('"').strip("%"))
                take_next_seq = False
                take_next_topo = True
                topo_header = True
            if take_next_seq == True and sequence_header != True and take_ID == True:
                Raw_Sequences.extend(line_list)
            if "//" in line_list and take_ID == True:
                Raw_Topos.extend(";")
                topo_header = False
                take_next_topo = False
                take_ID = False
            if take_next_topo == True and topo_header != True:
                Raw_Topos.extend(line_list)

    Sequences = "".join(Raw_Sequences).split(";")
    Sequences.remove("")
    keywords["Sequence"] = Sequences

    Topos = "".join(Raw_Topos).split(";")
    Topos.remove("")
    keywords["Topology"] = Topos

    # Creating Dataframe and saving it as csv
    dfKW = pd.DataFrame(keywords)
    # set the uniprot_acc as the index
    dfKW.set_index("Uniprot", inplace=True, drop=False)
    dfKW.index.name = "acc"

    # DEPRECATED. OMPdb seems to label everything as a fragment?
    # n_prot_before_dropping_fragments = dfKW.shape[0]
    # dfKW = dfKW.loc[dfKW.fragment == "no fragment annotation"]
    # n_prot_after_dropping_fragments = dfKW.shape[0]
    # n_prot_fragments_dropped = n_prot_before_dropping_fragments - n_prot_after_dropping_fragments

    n_fragments = dfKW.loc[dfKW.fragment != "N"].shape[0]
    logging.info("{}/{} proteins labeled as 'Fragment:N' in flatfile.".format(n_fragments, dfKW.shape[0]))

    utils.make_sure_path_exists(OMPdb_list_csv, isfile=True)
    dfKW.to_csv(OMPdb_list_csv)
    logging.info("parse_OMPdb_all_selected_to_csv is completed.\n"
                 "Final number of proteins = {}".format(dfKW.shape[0]))
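
# The flatfile parser above accumulates one list per field while scanning the file and only builds the
# DataFrame at the end. A tiny sketch of that dict-of-lists pattern is below (not part of the original
# pipeline; the three example records are invented for illustration).
def _example_dict_of_lists_to_df():
    """Accumulates per-field lists while 'parsing', then builds an indexed DataFrame."""
    import pandas as pd
    keywords = {"Uniprot": [], "Coverage(%)": [], "Topology_Reli": []}
    fake_records = [("P00001", 99.8, 84.1), ("P00002", 96.6, 94.0), ("P00003", 92.2, 93.1)]
    for uniprot, coverage, reliability in fake_records:
        # each field list must be appended exactly once per record, or the columns drift out of sync
        keywords["Uniprot"].append(uniprot)
        keywords["Coverage(%)"].append(coverage)
        keywords["Topology_Reli"].append(reliability)
    dfKW = pd.DataFrame(keywords)
    dfKW.set_index("Uniprot", inplace=True, drop=False)
    dfKW.index.name = "acc"
    return dfKW
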
def parse_flatfile_to_csv(selected_uniprot_records_flatfile, n_aa_before_tmd, n_aa_after_tmd, analyse_sp, logging, list_parsed_csv, slice=True):
    """ Parses a flatfile of UniProt records to csv.

    Parameters
    ----------
    selected_uniprot_records_flatfile : str
        Path to UniProt flatfile containing selected records for analysis.
    n_aa_before_tmd : int
        Number of amino acids before the TMD to be included when slicing the "TMD_plus_surr".
    n_aa_after_tmd : int
        Number of amino acids after the TMD to be included when slicing the "TMD_plus_surr".
    analyse_sp : bool
        Whether to analyse the signal peptides.
    logging : logging.Logger
        Logger for printing to console and logfile.
    list_parsed_csv : str
        Path to output csv file containing the list of proteins for analysis.

    Dataframes
    ----------
    dfu
        Dataframe for Uniprot
        index = acc for each protein
        columns = 'uniprot_acc', 'prot_descr', 'full_seq', etc

    Saved Files and Figures
    -----------------------
    list_summary_csv : csv
        CSV from dfu, with info for a protein on each row.
    """
    logging.info('~~~~~~~~~~~~ starting parse_flatfile_to_csv ~~~~~~~~~~~~')
    if not os.path.isfile(selected_uniprot_records_flatfile):
        return "parse_flatfile_to_csv could not be run. Uniprot flatfile not found. ({})".format(selected_uniprot_records_flatfile)
    uniprot_dict_all_proteins = {}
    with open(selected_uniprot_records_flatfile, "r") as f:
        records = SwissProt.parse(f)
        count_of_uniprot_records_processed = 0
        for m, record in enumerate(records):
            # create an empty output dictionary to hold the uniprot data for each record
            output_dict = {}
            # extract the subcellular location detail from the (poorly organized and unsorted) uniprot comments section
            comments_dict = {}
            try:
                for comment in record.comments:
                    # splits comments based on first ":" symbol, creates a list called split_comment
                    split_comment = comment.strip().split(': ', 1)
                    # several comments have the same name. need to check if it is already in the dictionary
                    if split_comment[0] in comments_dict:
                        # list the different comments, one after another
                        comments_dict[split_comment[0]] += ", %s" % split_comment[1]
                    else:
                        comments_dict[split_comment[0]] = split_comment[1]
                output_dict['comments_subcellular_location_uniprot'] = comments_dict['SUBCELLULAR LOCATION']
            except (AttributeError, KeyError):
                # there are no comments in this uniprot file!
                logging.info('no comments in Uniprot file')
                output_dict['comments_subcellular_location_uniprot'] = ''
            # use regex to search for text describing subcellular locations
            # [ -]? accepts either space, hyphen, or no dividing character
            regex_word_dict = {'multipass': ['multi[ -]?(pass|span)', 'poly[ -]?topic'],
                               'singlepass': ['single[ -]?(pass|span)', 'bi[ -]?topic'],
                               'membrane': ['membran', 'lipid[ -](anchor|bound)'],
                               'typeI': ['type[ -](one|1|I)[ -]membran'],
                               'typeII': ['type[ -](two|2|II)[ -]membran']}
            # comments_subcellular_location_uniprot = 'Membrane; Bitopictype I membrane protein.'
regex_subcell_loc_dict = {} for search_word in regex_word_dict: regex_subcell_loc_dict[search_word] = False regex_search_list = regex_word_dict[search_word] for regex_search_string in regex_search_list: # search for the regex string, ignoring any mismatches in upper or lower case comment_match = re.search(regex_search_string, output_dict['comments_subcellular_location_uniprot'], re.IGNORECASE) if comment_match: regex_subcell_loc_dict[search_word] = True # add all of the fields to the dictionary output_dict.update(regex_subcell_loc_dict) # print accession number sys.stdout.write("{}, ".format(record.accessions[0])) if count_of_uniprot_records_processed % 20 == 0: sys.stdout.write("\n".format(record.accessions[0])) sys.stdout.flush() # add data to dictionary output_dict['uniprot_acc'] = record.accessions[0] output_dict['organism'] = record.organism output_dict['uniprot_entry_name'] = record.entry_name output_dict['gene_name'] = record.gene_name output_dict['prot_descr'] = record.description output_dict['full_seq'] = record.sequence output_dict['uniprot_orgclass'] = record.organism_classification output_dict['uniprot_all_accessions'] = record.accessions output_dict['uniprot_KW'] = record.keywords output_dict['uniprot_features'] = record.features output_dict['seqlen'] = record.sequence_length # create a list of all the feature types (signal, transmem, etc) list_of_feature_types_in_uniprot_record = [] for sublist in record.features: list_of_feature_types_in_uniprot_record.append(sublist[0]) # list of the features that we want in the final csv desired_features_in_uniprot = ['TRANSMEM', 'VARIANT', 'CONFLICT', 'VAR_SEQ', 'VARSPLIC', 'TOPO_DOM'] if analyse_sp == True: desired_features_in_uniprot.append('SIGNAL') desired_features_in_uniprot_dict = {} location_of_sp_in_feature_list = [] location_of_tmds_in_feature_list = [] location_of_non_tmds_in_feature_list = [] # add bool if uniprot thinks that protein contains signal peptides if 'SIGNAL' in list_of_feature_types_in_uniprot_record: output_dict['uniprot_SiPe'] = True else: output_dict['uniprot_SiPe'] = False for feature in desired_features_in_uniprot: if feature in list_of_feature_types_in_uniprot_record: # find the features in the feature list. # For polytopic membrane proteins, there will be more than one tmd (labelled "TRANSMEM". 
location_of_features_in_feature_list = [i for i, x in enumerate(list_of_feature_types_in_uniprot_record) if x == feature] desired_features_in_uniprot_dict[feature] = location_of_features_in_feature_list if feature == 'SIGNAL': location_of_sp_in_feature_list = location_of_features_in_feature_list # location_of_sp_in_feature_list.sort() if feature == 'TRANSMEM': location_of_tmds_in_feature_list = location_of_features_in_feature_list # sort list to be sure that the "transmem" notation is definitely ordered correctly, # as this order determines the TMD name location_of_tmds_in_feature_list.sort() if feature == 'TOPO_DOM': location_of_non_tmds_in_feature_list = location_of_features_in_feature_list # sort list to be sure that the "transmem" notation is definitely ordered correctly, # as this order determines the TMD name location_of_non_tmds_in_feature_list.sort() # count the number of SP output_dict['number_of_SP'] = len(location_of_sp_in_feature_list) # count the number of "TRANSMEM" TMDs listed in the feature-list output_dict['number_of_TMDs'] = len(location_of_tmds_in_feature_list) # information about location of first non-tmd (extracellular or periplasmic/cytoplasmic) if len(location_of_non_tmds_in_feature_list) > 0: output_dict['loc_start'] = record.features[location_of_non_tmds_in_feature_list[0]][3] output_dict['n_term_ec'] = "Extracellular" in output_dict["loc_start"] else: output_dict['loc_start'] = np.nan output_dict['n_term_ec'] = np.nan # number of TMDs excluding signal peptides (which might be added later) number_of_TMDs_excl_SP = len(location_of_tmds_in_feature_list) output_dict['number_of_TMDs_excl_SP'] = number_of_TMDs_excl_SP list_of_TMDs = ["TM{:02d}".format(n) for n in range(1, number_of_TMDs_excl_SP + 1)] output_dict['list_of_TMDs'] = list_of_TMDs output_dict['list_of_TMDs_excl_SP'] = list_of_TMDs if number_of_TMDs_excl_SP > 0: #list_of_TMDs = [] TM_indices = [] for n, TMD_location in enumerate(location_of_tmds_in_feature_list): # consecutively number the TMDs based on the "TRANSMEM" location in the feature list #TMD = 'TM{:02d}'.format(n+1) #list_of_TMDs.append(TMD) TM_start = record.features[TMD_location][1] TM_end = record.features[TMD_location][2] # remove any strings or floats for TM_index in [TM_start, TM_end]: TM_index = TM_index if isinstance(TM_index, int) else np.nan # add to nested tuple TM_indices.append((TM_start, TM_end)) # DEPRECATED # # add the start and stop of each TMD, and the comments # output_dict['%s_start'%TMD] = record.features[TMD_location][1] # output_dict['%s_end'%TMD] = record.features[TMD_location][2] # output_dict['%s_description'%TMD] = record.features[TMD_location][3] # if isinstance(output_dict['%s_start'%TMD], str) or isinstance(output_dict['%s_end'%TMD], str): # logging.info("{} strings found in feature indices: {},{}".format(output_dict['uniprot_acc'], output_dict['%s_start'%TMD], output_dict['%s_end'%TMD])) # output_dict['%s_start' % TMD], output_dict['%s_end'%TMD] = np.nan, np.nan # information about SP location if output_dict['number_of_SP'] != 0: for SP_location in location_of_sp_in_feature_list: SP = 'SP01' list_of_TMDs.append(SP) for SP_location in location_of_sp_in_feature_list: output_dict['SP01_start'] = record.features[SP_location][1] output_dict['SP01_end'] = record.features[SP_location][2] output_dict['SP01_description'] = record.features[SP_location][3] if isinstance(output_dict['SP01_start'], str) or isinstance(output_dict['SP01_end'], str): logging.info("{} strings found in feature indices: 
{},{}".format(output_dict['uniprot_acc'], output_dict['SP01_start'], output_dict['SP01_end'])) output_dict['SP01_start'], output_dict['SP01_end'] = np.nan, np.nan # add the list of TMD names to the dictionary and dataframe #output_dict['list_of_TMDs'] = list_of_TMDs output_dict["TM_indices"] = TM_indices # create a numpy array of any sequence variants are in the TMD (and SP) region list_of_variant_types_in_uniprot = ['VARIANT', 'CONFLICT', 'VARSPLIC', 'VAR_SEQ'] for n, TMD in enumerate(list_of_TMDs): TM_start = TM_indices[n][0] TM_start = TM_indices[n][1] # array_of_all_variants_in_tmd = np.zeros(4) array_of_all_variants_in_tmd = np.array([]) for variant_type in list_of_variant_types_in_uniprot: if variant_type in desired_features_in_uniprot_dict.keys(): # if that variant is in the uniprot data for that protein, create a list of the indices showing where that variant is found list_of_variant_locations = list(desired_features_in_uniprot_dict[variant_type]) # get the specific start, end and details of that variant for v in range(len(list_of_variant_locations)): # get start start_of_variant_in_seq = record.features[list_of_variant_locations[v]][1] # get end end_of_variant_in_seq = record.features[list_of_variant_locations[v]][2] # get description variant_description = record.features[list_of_variant_locations[v]][3] variant_feature_identifier = record.features[list_of_variant_locations[v]][4] # check if the variant is in the tmd start_of_variant_is_after_start_of_tmd = True if start_of_variant_in_seq > TM_start else False end_of_variant_is_before_end_of_tmd = True if end_of_variant_in_seq < TM_start else False variant_is_in_tmd = True if all([start_of_variant_is_after_start_of_tmd, end_of_variant_is_before_end_of_tmd]) else False # if the variants are the tmd region, add to numpy array if variant_is_in_tmd: # create array of the variant data variant_array = np.array( [variant_type, start_of_variant_in_seq, end_of_variant_in_seq, variant_description, variant_feature_identifier]) if array_of_all_variants_in_tmd.size != 0: # add array with the data for this variant to the array/list for all variants array_of_all_variants_in_tmd = np.row_stack( (array_of_all_variants_in_tmd, variant_array)) else: # if the array is empty, replace the array for all variants with the array for the first variant array_of_all_variants_in_tmd = variant_array # if there were variants added (array is not empty), convert to string and add them to the output dictionary if array_of_all_variants_in_tmd.size: output_dict['%s_seq_variants'%TMD] = str(array_of_all_variants_in_tmd) count_of_uniprot_records_processed += 1 # nest each dictionary containing the data for each protein into a large dictionary that contains all data from all proteins uniprot_dict_all_proteins[output_dict['uniprot_acc']] = output_dict # convert that nested dict into a pandas dataframe, transverse dfu = pd.DataFrame(uniprot_dict_all_proteins).sort_index().T # count records in dataframe count_of_initial_uniprot_records = dfu.shape[0] # make a unique list of all TMD combinations in list([TM01], [TM01, TM03], etc) unique_TMD_combinations_orig = list(dfu.list_of_TMDs.astype(str).unique()) # convert to python list unique_TMD_combinations_lists = [ast.literal_eval(s) for s in unique_TMD_combinations_orig if "nan" not in s] # grab all unique values into a large list(e.g. 
TM01, TM02, TM03 until last TM of protein with most TMs) unique_TMD_combinations_single_list = [i for i in itertools.chain.from_iterable(unique_TMD_combinations_lists)] # sort list_all_TMDs_in_dataset = sorted(list(set(unique_TMD_combinations_single_list))) # extract the organism domain (e.g. Eukaryota) dfu['uniprot_orgclass'] = dfu['uniprot_orgclass'].astype(str) dfu['organism_domain'] = dfu.uniprot_orgclass.apply(lambda x: x.strip("'[]").split("', '")[0]) # convert python datatypes to strings, as these currently give a TypeError when saving to excel dfu['uniprot_all_accessions'] = dfu['uniprot_all_accessions'].astype(str) dfu['uniprot_KW'] = dfu['uniprot_KW'].astype(str) dfu['uniprot_features'] = dfu['uniprot_features'].astype(str) dfu['list_of_TMDs'] = dfu['list_of_TMDs'].astype(str) dfu['topol_source'] = "UniProt" #Hotfix: "UniProt" instead of "uniprot" or #else all proteins will be filtered out later # save to a csv utils.make_sure_path_exists(list_parsed_csv, isfile=True) # count records in dataframe count_of_uniprot_records_added_to_csv = dfu.shape[0] dfu.to_csv(list_parsed_csv, sep=",", quoting=csv.QUOTE_NONNUMERIC) return '\n%i valid UniProt records parsed to csv (from %i initial)\n~~~~~~~~~~~~ finished parse_flatfile_to_csv ~~~~~~~~~~~~' % (count_of_uniprot_records_added_to_csv, count_of_initial_uniprot_records)
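# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline above): how the regex_word_dict used in
# parse_flatfile_to_csv turns a UniProt SUBCELLULAR LOCATION comment into boolean columns.
# The helper name and the example string are hypothetical; the patterns are copied from above.
# --------------------------------------------------------------------------------------------
def _classify_subcell_location_sketch(comment):
    """Return a dict of bools (multipass, singlepass, membrane, typeI, typeII) for a comment string."""
    regex_word_dict = {'multipass': ['multi[ -]?(pass|span)', 'poly[ -]?topic'],
                       'singlepass': ['single[ -]?(pass|span)', 'bi[ -]?topic'],
                       'membrane': ['membran', 'lipid[ -](anchor|bound)'],
                       'typeI': ['type[ -](one|1|I)[ -]membran'],
                       'typeII': ['type[ -](two|2|II)[ -]membran']}
    return {key: any(re.search(pattern, comment, re.IGNORECASE) for pattern in patterns)
            for key, patterns in regex_word_dict.items()}

# e.g. _classify_subcell_location_sketch("Membrane; Single-pass type I membrane protein.")
# -> {'multipass': False, 'singlepass': True, 'membrane': True, 'typeI': True, 'typeII': False}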
def download_homologues_from_simap(pathdict, s, logging): """From the list of proteins in csv format, begins downloading homologues from the SIMAP database. - opens the csv file containing the list of proteins - opens or creates a text file with the list of failed downloads - checks if there is enough hard-drive space - checks what files currently exist (feature table, homologue, zip) - tries to download feature table (if download_feature_tables listed as TRUE in settings) - tries to download homologues - if both feature table and homologues exist, compresses both into a tarball and deletes original files - counts the number of failed downloads. Assumes most failed downloads are due to server errors on the SIMAP side. With more and more failed downloads, sleeps for longer and longer. Parameters ---------- pathdict : dict Dictionary of the key paths and files associated with that List number. s : dict Settings dictionary extracted from excel settings file. logging : logging.Logger Logger for printing to console and logfile. Saved Files and Figures ----------------------- PROTEIN_NAME_SIMAP.tar.gz : gzip file (e.g. A1A5B4_ANO9_HUMAN_SIMAP.tar.gz) Contains -------- PROTEIN_NAME_feature_table.xml (e.g. A1A5B4_ANO9_HUMAN_feature_table.xml) XML feature table from SIMAP, with information regarding each protein. PROTEIN_NAME_homologues.xml (e.g. A1A5B4_ANO9_HUMAN_homologues.xml) homologues from SIMAP in SIMAP-XML (rather than BLAST-XML) format PROTEIN_NAME--DATE--RESEARCHERNAME.txt (e.g. A1DT13_A1DT13_HUMAN--20160811--Mark Teese.txt) [only in later versions] Text file showing the download date and researcher name. pathdict["failed_downloads_txt"] : txt File containing a list of accessions that could not be downloaded. At each run, the program checks if this file exists. If it doesn't exist, it will be created. If it exists, the settings file determines whether the previously failed downloads will be re-attempted. """ logging.info("~~~~~~~~~~~~ starting download_homologues_from_simap ~~~~~~~~~~~~") df = pd.read_csv(pathdict["list_csv"], sep = ",", quoting = csv.QUOTE_NONNUMERIC, index_col = 0) if s["attempt_prev_failed_downloads"] == False: # get list of accessions that could not be downloaded, and can immediately be excluded acc_list_failed_downloads = utils.get_acc_list_from_txt(pathdict["failed_downloads_txt"]) not_in_homol_db = utils.get_acc_list_from_txt(pathdict["acc_not_in_homol_db_txt"]) acc_excluded_list = acc_list_failed_downloads + not_in_homol_db # the list of desired proteins = total_list - excluded acc_not_excluded = list(set(df.index) - set(acc_excluded_list)) # filter dataframe to only contain the desired proteins, which have not been excluded df = df.loc[acc_not_excluded, :] max_hits = s["max_hits"] java_exec_str = s["java_exec_str"] max_memory_allocation = s["java_max_RAM_memory_allocated_to_simap_download"] taxid = s["taxid"] # eg.'7227' for Drosophila melanogaster # if "Linux" in platform.system() or "Windows" in platform.system(): # # if Linux or Windows # byteformat = "GB" # data_harddrive = os.path.splitdrive(s["data_dir"])[0] # # print initial hard-drive space # size = utils.get_free_space(data_harddrive, byteformat) # logging.info('Hard disk remaining space = {}'.format(size)) # else: # # assume the system is a mac # # code works only on mac!!! 
reverted to previous version # statvfs = os.statvfs(s["simap_dir"]) # available_space = statvfs.f_frsize * statvfs.f_bavail # size = available_space / 1073741824 # # print initial hard-drive space # logging.info('Hard disk remaining space = {:.2f} GB'.format(size)) #iterate over each uniprot record contained in the dataframe. note that acc = uniprot accession number number_of_files_not_found = 0 for acc in df.index: # check hand-drive space before each download # try: # if "Linux" in platform.system() or "Windows" in platform.system(): # size = utils.get_free_space(data_harddrive, byteformat) # if size[0] < 5: # raise utils.HardDriveSpaceException("Hard drive space limit reached, there is only %s %s space left." % (size[0], size[1])) # else: # # MAC only stuff... # statvfs = os.statvfs(s["simap_dir"]) # available_space = statvfs.f_frsize * statvfs.f_bavail # size = available_space / 1073741824 # if size < 5: # raise utils.HardDriveSpaceException("Hard drive space limit reached, there is only %s %s space left." % (size[0], size[1])) # except utils.HardDriveSpaceException as e: # logging.warning(e) protein_name = df.loc[acc, 'protein_name'] seqlen = df.loc[acc, 'seqlen'] input_sequence = df.loc[acc, 'full_seq'] SIMAP_tar = df.loc[acc, 'SIMAP_tar'] ft_xml_path = df.loc[acc, 'SIMAP_feature_table_XML_path'] homol_xml_path = df.loc[acc, 'SIMAP_homol_XML_path'] date_file_path = df.loc[acc, 'SIMAP_download_date_file_path'] # create directories to hold file, if necessary utils.make_sure_path_exists(homol_xml_path, isfile=True) #check which files exist and delete corrupt tarballs ft_XML_exists, homol_XML_exists, SIMAP_tar_exists, ff, hh = utils.check_SIMAP_tarfile(SIMAP_tar, ft_xml_path, homol_xml_path, acc, logging, delete_corrupt=True) ''' Windows command prompt accepts only 8191 characters. Limit protein length according to settings (typically max length = 3000) ''' if 'Windows' in str(platform.system()): too_large_to_download_list = utils.get_acc_list_from_txt(pathdict["too_large_to_download_txt"]) if seqlen > s["max_query_sequence_length"]: logging.warning('%s homologue download will be skipped. It cannot be processed into a java command in windows OS,' 'as the sequence is longer than %i characters (%i). Moving to next sequence' % (protein_name, s["max_query_sequence_length"],seqlen)) # if the accession is not already in the text file, add it if acc not in too_large_to_download_list: # add accession number to the list of failed downloads with open(pathdict["too_large_to_download_txt"], "a") as source: source.write("\n{}".format(acc)) # skip this protein continue if SIMAP_tar_exists and s["overwrite_homologue_files"] == False: # skip this protein logging.info("{} SIMAP_tar_exists, download skipped.".format(acc)) continue eaSimap_path = os.path.join(s["data_dir"], "programs", "eaSimap.jar") # NOTE: DOWNLOADING FEATURE TABLES IS NO LONGER CONSIDERED NECESSARY. 
if not ft_XML_exists and s["download_feature_tables"] == True: #download feature table from SIMAP korbinian.simap_download.retrieve_simap_feature_table(input_sequence, java_exec_str=java_exec_str, max_memory_allocation=500, output_file=ft_xml_path, eaSimap_path=eaSimap_path) utils.sleep_x_seconds(60) if not homol_XML_exists: #download homologue file from SIMAP korbinian.simap_download.retrieve_simap_homologues(input_sequence, output_file=homol_xml_path, max_hits=max_hits, java_exec_str=java_exec_str, max_memory_allocation=max_memory_allocation, taxid=taxid, eaSimap_path=eaSimap_path) # sometimes the SIMAP server seems to like a little rest in between downloads? utils.sleep_x_seconds(30) #now check again if the files exist ft_XML_exists, homol_XML_exists, SIMAP_tar_exists, ff, hh = utils.check_SIMAP_tarfile(SIMAP_tar, ft_xml_path, homol_xml_path, acc, logging) if not homol_XML_exists: # add accession number to the list of failed downloads with open(pathdict["failed_downloads_txt"], "a") as source: source.write("\n{}".format(acc)) #add one to the list of consecutive failed downloads. number_of_files_not_found += 1 if s["sleep_if_downloads_unsuccessful"]: # if a large number of downloads failed, then the SIMAP server is probably not working. # Wait some time and try again later. if number_of_files_not_found > 30: sys.stdout.write("\nnumber_of_files_not_found = {}, sleeping for 24 h".format(number_of_files_not_found)) utils.sleep_x_hours(24) if number_of_files_not_found == 20: sys.stdout.write("\nnumber_of_files_not_found = {}, sleeping for 6 h".format(number_of_files_not_found)) utils.sleep_x_hours(6) if number_of_files_not_found == 15: sys.stdout.write("\nnumber_of_files_not_found = {}, sleeping for 1 h".format(number_of_files_not_found)) utils.sleep_x_hours(1) else: # if download is successful or file exists, the SIMAP server must be working, # therefore reset the number_of_files_not_found number_of_files_not_found = 0 # create an empty text file with the download date date = strftime("%Y%m%d") with open(date_file_path, "w") as f: f.write("{}\nEmpty file with the date.\nHave a nice day!".format(date)) with tarfile.open(SIMAP_tar, mode='w:gz') as tar: #add the files to the compressed tarfile logging.info('%s XML files will be moved into the tarball, original XML files deleted' % protein_name) tar.add(homol_xml_path, arcname=os.path.basename(homol_xml_path)) tar.add(date_file_path, arcname=os.path.basename(date_file_path)) if ft_XML_exists: tar.add(ft_xml_path, arcname=os.path.basename(ft_xml_path)) #delete the original files try: os.remove(homol_xml_path) os.remove(date_file_path) if ft_XML_exists: os.remove(ft_xml_path) except FileNotFoundError: pass logging.info("~~~~~~~~~~~~ finished download_homologues_from_simap ~~~~~~~~~~~~")
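# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline above): the escalating back-off applied in
# download_homologues_from_simap when consecutive downloads fail, which is assumed to indicate
# a SIMAP server problem. The helper name is hypothetical; the thresholds mirror the code above.
# --------------------------------------------------------------------------------------------
def _simap_backoff_hours_sketch(number_of_files_not_found):
    """Return how many hours to sleep, given the number of consecutive failed downloads."""
    if number_of_files_not_found > 30:
        return 24
    if number_of_files_not_found == 20:
        return 6
    if number_of_files_not_found == 15:
        return 1
    return 0

# a successful download resets the counter to 0, so the back-off only grows while the server keeps failing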
def run_statements(s): list_number = s["list_number"] # setup error logging logging = korbinian.common.setup_keyboard_interrupt_and_error_logging( s, list_number) # print the list number describing the protein list logging.warning("list_number : {}".format(list_number)) # open the tab containing the list-specific settings as a dataframe df_list_settings = pd.read_excel(s["excel_file_with_settings"], sheetname="lists", index_col=0) relevant_row = df_list_settings.loc[list_number, :].to_dict() if np.nan in relevant_row.values(): raise ValueError( "The row for List{} in the lists tab of the settings file is missing some values." .format(list_number)) # add the relevant row (e.g. for List01) to the existing settings dictionary # this adds max_lipo_homol, rand_TM, rand_nonTM, etc to the dictionary s.update(relevant_row) # set a base folder for the summaries, e.g. "D:\Databases\summaries\05\" for list 05 base_filename_summaries = os.path.join(s["data_dir"], "summaries", '%02d' % list_number, 'List%02d' % list_number) # create dictionary of paths for output files # for example the basic pathdict["list_csv"] for list 5 is "D:\Databases\summaries\05\List05_summary.csv" pathdict = korbinian.common.create_pathdict(base_filename_summaries, s) utils.make_sure_path_exists(pathdict["settings_copy_xlsx"], isfile=True) # copy the settings file used for the analysis copyfile(s["excel_file_with_settings"], pathdict["settings_copy_xlsx"]) ######################################################################################## # # # prot_list, OMPdb (create a list of proteins from the OMPdb database) # # # ######################################################################################## if s["OMPdb_extract_omp_IDs_from_nr_fasta"]: ListXX_OMPdb_nr_fasta = os.path.join( s["data_dir"], "OMPdb", "List{:02d}_OMPdb_nr_fasta.txt".format(list_number)) ListXX_OMPdb_nr_acc = os.path.join( s["data_dir"], "OMPdb", "List{:02d}_OMPdb_nr_acc.txt".format(list_number)) korbinian.prot_list.parse_OMPdb.extract_omp_IDs_from_nr_fasta( ListXX_OMPdb_nr_fasta, ListXX_OMPdb_nr_acc, logging) if s["OMPdb_parse_OMPdb_all_selected_to_csv"]: ListXX_OMPdb_nr_acc = os.path.join( s["data_dir"], "OMPdb", "List{:02d}_OMPdb_nr_acc.txt".format(list_number)) ListXX_OMPdb_redundant_flatfile = os.path.join( s["data_dir"], "OMPdb", "List{:02d}_OMPdb_redundant_flatfile.flat".format(list_number)) OMPdb_list_csv = os.path.join( s["data_dir"], "OMPdb", "List{:02d}_OMPdb_summary.csv".format(list_number)) korbinian.prot_list.parse_OMPdb.parse_OMPdb_all_selected_to_csv( ListXX_OMPdb_nr_acc, ListXX_OMPdb_redundant_flatfile, OMPdb_list_csv, logging, s) if s["OMPdb_get_TM_indices_and_slice"]: OMPdb_list_csv = os.path.join( s["data_dir"], "OMPdb", "List{:02d}_OMPdb_summary.csv".format(list_number)) list_parsed_csv = pathdict["list_parsed_csv"] OMPdb_topology_reliability_cutoff = s[ "OMPdb_topology_reliability_cutoff"] korbinian.prot_list.parse_OMPdb.get_omp_TM_indices_and_slice_from_summary_table( OMPdb_list_csv, list_parsed_csv, OMPdb_topology_reliability_cutoff, logging, s) ######################################################################################## # # # prot_list, UniProt (create a list of proteins from the UniProt database) # # # ######################################################################################## # define the uniprot directory with selected records uniprot_dir = os.path.join(s["data_dir"], 'uniprot') selected_uniprot_records_flatfile = os.path.join( uniprot_dir, 'selected', 
'List%02d_selected_uniprot_records_flatfile.txt' % list_number) if s["parse_large_flatfile_with_list_uniprot_accessions"]: input_accession_list_path = os.path.join( s["data_dir"], "uniprot", "selected", "List{:02d}_uniprot_accessions.txt".format(list_number)) korbinian.prot_list.uniprot_retrieve.parse_large_flatfile_with_list_uniprot_accessions( s, input_accession_list_path, uniprot_dir, logging, selected_uniprot_records_flatfile) if s["retrieve_uniprot_data_for_acc_list_in_xlsx_file"]: input_uniprot_flatfile = "function not implemented!" excelfile_with_uniprot_accessions = os.path.join( base_filename_summaries, '.xlsx') korbinian.prot_list.uniprot_retrieve.retrieve_uniprot_data_for_acc_list_in_xlsx_file( excelfile_with_uniprot_accessions, input_uniprot_flatfile, selected_uniprot_records_flatfile, logging) if s["create_nonred_uniprot_flatfile_via_uniref"] == True: korbinian.prot_list.uniprot_nonredundant.create_nonred_uniprot_flatfile_via_uniref( s, uniprot_dir, selected_uniprot_records_flatfile, logging) if s["parse_flatfile_to_csv"]: n_aa_before_tmd = s["n_aa_before_tmd"] n_aa_after_tmd = s["n_aa_after_tmd"] list_parsed_csv = pathdict["list_parsed_csv"] # whether to analyse signal peptides for this dataset analyse_sp = True if "SiPe" in s["regions"] else False output = korbinian.prot_list.uniprot_parse.parse_flatfile_to_csv( selected_uniprot_records_flatfile, n_aa_before_tmd, n_aa_after_tmd, analyse_sp, logging, list_parsed_csv) logging.info(output) ######################################################################################## # # # Membrane protein filtering # # # ######################################################################################## if s["TM_filtering_SCAMPI"]: korbinian.filtering.scampi.run_filtering(pathdict, s, logging) if s["TM_filtering_SignalP"]: korbinian.filtering.signalP.run_filtering(pathdict, s, logging) if s["TM_filtering_TMSEG_PSI-BLAST_createDatabase"]: korbinian.blast.psiblast.create_BLAST_database(pathdict, s, logging) if s["TM_filtering_TMSEG_PSI-BLAST"]: korbinian.blast.psiblast.run_BLAST(pathdict, s, logging) if s["TM_filtering_TMSEG"]: korbinian.filtering.tmseg.run_filtering(pathdict, s, logging) ######################################################################################## # # # prepare_protein_list # # # ######################################################################################## if s["prepare_protein_list"]: korbinian.prot_list.prot_list.get_topology_for_prot_list( s, pathdict, logging) korbinian.prot_list.prot_list.slice_TMDs_in_prot_list( s, pathdict, logging) korbinian.prot_list.prot_list.prepare_protein_list( s, pathdict, logging) if s['generate_scampi_input_files']: korbinian.prot_list.SCAMPI.generate_scampi_input_files( pathdict, s, logging) if s['generate_SignalP_input_files']: korbinian.prot_list.SCAMPI.generate_SignalP_input_files( pathdict, s, logging) ######################################################################################## # # # run simap download, parse simap # # # ######################################################################################## if s["download_homologues"]: korbinian.simap_download.download_homologues_from_simap( pathdict, s, logging) if s["parse_simap_to_csv"]: korbinian.simap_parse.run_parse_simap_to_csv(pathdict, s, logging) ######################################################################################## # # # run BLAST and parse results # # # ######################################################################################## if 
s["BLASTp_search"]: if s["BLAST_modus"] == "online": korbinian.blast.blastp.run_BLAST_online(pathdict, s, logging) elif s["BLAST_modus"] == "local": korbinian.blast.blastp.run_BLAST_local(pathdict, s, logging) if s["BLAST_parser"]: korbinian.blast.blast_parser.run(pathdict, s, logging) ######################################################################################## # # # run_create_fasta, run_calculate_AAIMON_ratios # # # ######################################################################################## if s["slice_TMDs_from_homologues"]: korbinian.cons_ratio.slice.run_slice_TMDs_from_homologues( pathdict, s, logging) if s["create_fasta"]: korbinian.fasta.run_create_fasta(pathdict, s, logging) if s["calculate_AAIMON_ratios"]: korbinian.cons_ratio.cons_ratio.run_calculate_AAIMONs( pathdict, s, logging) if s['filter_truncated_alignments']: korbinian.cons_ratio.cons_ratio.throw_out_truncated_sequences( pathdict, s, logging) if s["gather_AAIMON_ratios"]: # reassign pathdict that could have been recreated during gather depending on settings pathdict = korbinian.cons_ratio.gather.gather_AAIMONs( pathdict, logging, s) ######################################################################################## # # # gap density analysis # # # ######################################################################################## if s["calculate_gap_densities"]: korbinian.gap.run_calculate_gap_densities(pathdict, s, logging) if s["gather_gap_densities"]: korbinian.gap.gather_gap_densities(pathdict, s, logging) if s["create_graph_of_gap_density"]: korbinian.gap_figs.create_graph_of_gap_density(pathdict, s, logging) if s["save_fastagap"]: korbinian.fastagap.save_fastagap(pathdict, s, logging) if s["calc_fastagap_densities"]: korbinian.fastagap.run_calc_fastagap_densities(pathdict, s, logging) ######################################################################################## # # # conservation ratio (AAIMON ratio) figures # # # ######################################################################################## if s["run_keyword_analysis"]: output = korbinian.cons_ratio.keywords.keyword_analysis( pathdict, s, logging) logging.info(output) '''+++++++++++++++ Summary figures describing the conservation ratios of proteins in the list ++++++++++++++++++''' if s["save_figures_describing_proteins_in_list"]: return_statement = korbinian.cons_ratio.figs.save_figures_describing_proteins_in_list( pathdict, s, logging) logging.info(return_statement) '''+++++++++++++++ Summary figures describing the conservation ratios of proteins in the list ++++++++++++++++++''' # if s["compare_lists"]: # korbinian.cons_ratio.compare_lists_old.compare_rel_con_lists(pathdict, s, logging) if "gather_pretty_alignments" in s.keys(): if s["gather_pretty_alignments"]: korbinian.cons_ratio.gather.gather_pretty_alignments( pathdict, logging, s) if s['send_email_when_finished']: korbinian.utils.send_email_when_finished(s, pathdict) sys.stdout.write( '\n~~~~~~~~~~~~ List {} finished ~~~~~~~~~~~~\n'. format(list_number))
def run_psiblast_on_fasta_queries_in_folder(query_dir, databases_dir, psiblast_exec_str, db, timeout_h, retry_failed=False, retry_successful=False, retry_timeout=False, n_threads=5): """Runs standalone PSIBLAST on every query fasta file in a folder. What you need: - query folder with I3L0P3.fasta, P42532.fasta etc - standalone BLAST must be working - databases folder with databases\\uniref\\vertebra90\\vertebra90.fasta, or databases\\uniref\\uniref90\\uniref90.fasta etc Where are the output files saved? - databases\\BLAST\\PSI\\vertebra90\\P4\\P42532_PSIBLAST.tar.gz Parameters ---------- query_dir : str Folder containing protein sequences in fasta format. databases_dir : str Databases directory, e.g. "D:\Databases" psiblast_exec_str : str Path to psiblast executable if you are using linux or your Windows environmental variables are working, can simply be "psiblast" db : str Database for PSI-BLAST e.g. "metazoa90" Determines the filepath for the .fasta containing the search database. timeout_h : int Hours allotted before timeout in PSIBLAST command. Since some proteins are extremely slow, suggest making a quick run through the list first (2 hr), and if time permits, a slow run later for the last few (6h). retry_failed : bool If True, proteins in the list of failed acc will be re-attempted retry_successful : bool If True, previous files will be overwritten Usage ----- %capture from korbinian.psiblast import run_psiblast_on_fasta_queries_in_folder # SET YOUR GENERAL DATABASES DIRECTORY databases_dir = r"D:\\Databases" # uniref files must be in databases\\uniref\\database # SET THE DIRECTORY CONTAINING THE FASTA SEQUENCES YOU WANT TO USE AS A QUERY query_dir = r"D:\\Databases\\xtestproteins\\BLAST_small" # SET YOUR BLAST EXECUTABLE (windows) psiblast_exec_str = r"C:\\Program Files\\NCBI\\blast-2.6.0+\\bin\\psiblast.exe" run_psiblast_on_fasta_queries_in_folder(query_dir, databases_dir, psiblast_exec_str) """ # set location of logfile date_string = time.strftime("%Y%m%d") logfile = os.path.join(query_dir, "{}_PSI-BLAST_logfile.txt".format(date_string)) logging = korbinian.common.setup_error_logging(logfile) # set location of txt file containing the failed sequences failed_psiblast_list_txt = os.path.join(query_dir, "failed_PSIBLAST_list.txt") timeout_psiblast_list_txt = os.path.join(query_dir, "timeout_PSIBLAST_list.txt") ######################################################################################## # # # Create a list of all FASTA files in a particular folder for analysis # # # ######################################################################################## query_fasta_list = glob.glob(query_dir + "\*.fasta") logging.info("query_fasta_list[0:5] : {}".format(query_fasta_list[0:5])) ######################################################################################## # # # Get list of previously failed sequences # # # ######################################################################################## if os.path.isfile(failed_psiblast_list_txt): failed_psiblast_list = utils.get_acc_list_from_txt( failed_psiblast_list_txt) else: failed_psiblast_list = [] if os.path.isfile(timeout_psiblast_list_txt): timeout_psiblast_list = utils.get_acc_list_from_txt( timeout_psiblast_list_txt) else: timeout_psiblast_list = [] logging.info("failed_psiblast_list[0:5] : {}".format( failed_psiblast_list[0:5])) logging.info("timeout_psiblast_list[0:5] : {}".format( timeout_psiblast_list[0:5])) ######################################################################################## # # # create 
a dictionary, s, with various parameters # # can be converted to a korbinian dictionary later # # # ######################################################################################## s = {} s["psiblast_exec_str"] = psiblast_exec_str s["evalue"] = "1e-5" s["inclusion_ethresh"] = "1e-5" s["num_threads"] = n_threads # s["db"] = "metazoa90" s["num_descriptions"] = 3000 s["num_alignments"] = 3000 command_str = '"{psiblast_exec_str}" -query {query} -db {db} -out_pssm {out_pssm} -out_ascii_pssm {out_ascii_pssm} '\ '-out {out_BLAST_xml} -evalue {evalue} -inclusion_ethresh {inclusion_ethresh} -num_iterations 3 '\ '-use_sw_tback -seg no -num_threads {num_threads} -num_descriptions {num_descriptions} -num_alignments {num_alignments} -comp_based_stats 1' logging.info("Example of command str, before inserting variables".format( command_str)) ######################################################################################## # # # Run PSI-BLAST for each query sequence: # # input: query.fas, database.fasta (after makeblastdb) # # output: query.pssm, query_ascii.pssm, query_BLAST.xml, query_date.txt # # (compressed into a tarball, query_PSIBLAST.tar.gz) # # # ######################################################################################## # define the BLAST database. Note that you should specify the fasta file. db_path = os.path.join(databases_dir, "uniref\{db}\{db}.fasta".format(db=db)) for query in query_fasta_list: acc = os.path.basename(query).split(".")[0] # get first two letters of acc, used as a subfolder first2 = acc[:2] # create a basename, e.g. "D:\Databases\BLAST\PSI\vertebra90\P4\P42532" from which files are created basename = r"D:\Databases\BLAST\PSI\{db}\{first2}\{acc}".format( db=db, first2=first2, acc=acc) # create path for output files out_pssm = basename + ".pssm" out_ascii_pssm = basename + "_ascii.pssm" out_BLAST_xml = basename + "_BLAST.xml" date_file_path = basename + "_BLAST_date.txt" PSIBLAST_tar = basename + "_PSIBLAST.tar.gz" # if the tar exists or the accession has previously failed, skip to the next protein if os.path.exists(PSIBLAST_tar) and retry_successful == False: message = "{} PSIBLAST_tar exists, file skipped".format(acc) logging.info(message) continue if acc in failed_psiblast_list and retry_failed == False: message = "{} acc is in failed_psiblast_list, file skipped".format( acc) logging.info(message) continue if acc in timeout_psiblast_list and retry_timeout == False: message = "{} acc is in timeout_psiblast_list, file skipped".format( acc) logging.info(message) continue # print accession logging.info("\n{}".format(acc)) # create folders if necessary utils.make_sure_path_exists(out_ascii_pssm, isfile=True) # start a timer start = time.clock() ######################################################################################## # # # run the PSI-BLAST command-line argument # # # ######################################################################################## # create full command string to be run, as if in the console c = command_str.format(psiblast_exec_str=s["psiblast_exec_str"], query=query, db=db_path, out_pssm=out_pssm, out_ascii_pssm=out_ascii_pssm, out_BLAST_xml=out_BLAST_xml, evalue=s["evalue"], inclusion_ethresh=s["inclusion_ethresh"], num_threads=s["num_threads"], num_descriptions=s["num_descriptions"], num_alignments=s["num_alignments"]) logging.info("{}".format(c)) command = utils.Command(c) # Run the command. Set the timeout in seconds command.run(timeout=int(timeout_h * 60 * 60)) # wait 5 seconds. 
In some cases, the files are not immediately recognised as existing? utils.sleep_x_seconds(5, print_stuff=False) ######################################################################################## # # # if successful, move output files into a tarball # # # ######################################################################################## # check which output files exist (e.g. [True, True, False]) output_file_exists_list = [ os.path.exists(out_pssm), os.path.exists(out_ascii_pssm), os.path.exists(out_BLAST_xml) ] logging.info("pssm, ascii_pssm, xml exists : {}".format( output_file_exists_list)) # create boolean to catch timeout errors there_is_an_error_in_file_deletion = False # if all output files exist, create a date file and move all to a tarball if False not in output_file_exists_list: duration = time.clock() - start with open(date_file_path, "w") as f: f.write( "Acc\t{}\nDate\t{}\nDatabase\t{}\nGreeting\tHave a nice day!" .format(acc, date_string, db)) # move all files into the tarball file_list = [ out_pssm, out_ascii_pssm, out_BLAST_xml, date_file_path ] with tarfile.open(PSIBLAST_tar, mode='w:gz') as tar: # add the files to the compressed tarfile logging.info( '{} files will be moved into the tarball, original files deleted.\nPSIBLAST duration = {:0.3f} min' .format(acc, duration / 60)) for file in file_list: try: tar.add(file, arcname=os.path.basename(file)) except FileNotFoundError: # wait 10 seconds. In some cases, the files are not immediately recognised as existing? Very rare. utils.sleep_x_seconds(10, print_stuff=False) # here is where I should import goto, and goto the line above :) try: tar.add(file, arcname=os.path.basename(file)) except FileNotFoundError: # For whatever reason the file's still not there. Give up. logging.warning( '{}, file could not be added to tarball. Filepath = {}. ' .format(acc, file)) # wait 5 seconds. In some cases, the files are not immediately recognised as existing? utils.sleep_x_seconds(5, print_stuff=False) # delete the original files for file in file_list: try: os.remove(file) except (FileNotFoundError, PermissionError): logging.warning( '{} ERROR. Could not be deleted'.format(file)) there_is_an_error_in_file_deletion = True else: if acc not in failed_psiblast_list: # add accession number to the list of failed blast sequences with open(failed_psiblast_list_txt, "a") as source: source.write("\n{}".format(acc)) if there_is_an_error_in_file_deletion: if acc not in timeout_psiblast_list: # PSIBLAST probably timed out, files are incomplete # add accession number to the list of timed-out sequences with open(timeout_psiblast_list_txt, "a") as source: source.write("\n{}".format(acc))
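# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline above): utils.Command in the function above runs
# the assembled psiblast command line with a timeout. A minimal stand-in using only the standard
# library could look like this; the helper name is hypothetical and error handling is reduced to
# returning None on timeout.
# --------------------------------------------------------------------------------------------
def _run_command_with_timeout_sketch(command_string, timeout_seconds):
    """Run a shell command, killing it if it exceeds timeout_seconds. Returns the exit code, or None on timeout."""
    import subprocess
    try:
        completed = subprocess.run(command_string, shell=True, timeout=timeout_seconds,
                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return completed.returncode
    except subprocess.TimeoutExpired:
        return None

# e.g. _run_command_with_timeout_sketch(c, int(timeout_h * 60 * 60)) would mirror command.run(timeout=...) above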
def parse_TMSEG_results_DEPRECATED(pathdict, s, logging): """DEPRECATED METHOD BASED ON LARGE FILE OF ALL TMSEG RESULTS USE METHODS BASED ON INDIVIDUAL TMSEG DATAFILES INSTEAD. """ logging.info("~~~~~~~~~~~~ starting parse_TMSEG_results_DEPRECATED ~~~~~~~~~~~~") # create or open dataframe for protein list summary if os.path.isfile(pathdict["prot_list_summary_csv"]): df_PLS = pd.read_csv(pathdict["prot_list_summary_csv"], index_col=0) else: df_PLS = pd.DataFrame(columns=["v", "date"]) # get the timestamp for current time t = time.ctime(time.time()) list_number = s['list_number'] # define the uniprot directory with selected records uniprot_dir = os.path.join(s["data_dir"], 'uniprot') selected_uniprot_records_flatfile = os.path.join(uniprot_dir, 'selected', 'List%02d_selected_uniprot_records_flatfile.txt' % list_number) n_aa_before_tmd = s["n_aa_before_tmd"] n_aa_after_tmd = s["n_aa_after_tmd"] list_parsed_csv = pathdict["list_parsed_csv"] # check if the lists tab says to analyse the signal peptides analyse_sp = True if "SiPe" in s["regions"] else False output = korbinian.prot_list.uniprot_parse.parse_flatfile_to_csv(selected_uniprot_records_flatfile, n_aa_before_tmd, n_aa_after_tmd, analyse_sp, logging, list_parsed_csv, slice=False) logging.info(output) TMSEG_fastalike_path = pathdict['TMSEG_fastalike'] TMSEG_top_txtoutput_path = pathdict['TMSEG_top_txtoutput'] TMSEG_nonTM_outpath = pathdict['TMSEG_nonTM'] df_parsed = pd.read_csv(pathdict["list_parsed_csv"], sep=",", quoting=csv.QUOTE_NONNUMERIC, index_col=0, low_memory=False) columns_to_keep = ['organism_domain', 'uniprot_acc', 'uniprot_all_accessions', 'uniprot_entry_name', 'uniprot_features', 'uniprot_orgclass', 'uniprot_SiPe', 'singlepass', 'typeI', 'typeII', 'uniprot_KW', 'organism', 'prot_descr', 'membrane', 'multipass', 'gene_name', 'comments_subcellular_location_uniprot', 'uniprot_SiPe', 'full_seq'] # # for datasets without SP found, turn off analyse_sp # if analyse_sp == True and 'SP01_start' in df_parsed.columns: # columns_to_keep = ['SP01_start', 'SP01_end', 'SP01_seq'] # else: # analyse_sp == False acc_list_orig = list(df_parsed.index) if os.path.isfile(TMSEG_fastalike_path): df_PLS.loc["TMSEG_fastalike_path", :] = ("exists", t) sys.stdout.write("Extracting topology from TMSEG_fastalike file.") # DEPRECATED drop the full sequence, and get from TMSEG #df_parsed.drop('full_seq', axis=1, inplace=True) # read data from file # list will have acc, seq, topology, acc, seq, topology etc input_data = [] with open(TMSEG_fastalike_path) as data_file: for line in data_file: line = line.strip() if line[0] == '>': line = line[1:] line = line.split(' ') line = line[0].split('|') uniprot_acc = line[0] input_data.append(uniprot_acc) else: input_data.append(line) # initialise pandas dataframe with uniprot accession as index df_TMSEG = pd.DataFrame(index=input_data[0::3]) # add the signal peptide definitions from UniProt, to be used for slicing the nonTMD etc later if analyse_sp: for col in ['SP01_start', 'SP01_end', 'SP01_seq']: df_TMSEG[col] = df_parsed[col] # drop unnecessary columns from df_parsed, to be merged later df_parsed = df_parsed[columns_to_keep] # add selected columns from input_data list #df_TMSEG['uniprot_entry_name'] = input_data[1::5] #df_TMSEG['prot_descr'] = input_data[2::5] df_TMSEG['full_seq'] = input_data[1::3] df_TMSEG['topo'] = input_data[2::3] acc_list_TMSEG = df_TMSEG.index.tolist() TMSEG_avail_list = set(acc_list_TMSEG).intersection(set(acc_list_orig)) TMSEG_unavail_list = list(set(acc_list_orig) - set(acc_list_TMSEG)) 
df_PLS.loc["n_prot_TMSEG_file"] = (len(acc_list_TMSEG), t) # create a boolean whether the TMSEG topology is available df_parsed.loc[TMSEG_avail_list,"TMSEG_avail"] = True df_parsed.loc[TMSEG_unavail_list, "TMSEG_avail"] = False # drop proteins from df_TMSEG that are not in the listxx_parsed.csv df_TMSEG = df_TMSEG.loc[TMSEG_avail_list, :] fa_dir = pathdict['TMSEG_unavail_fa_dir'] utils.make_sure_path_exists(fa_dir) for acc in TMSEG_unavail_list: out_fasta = os.path.join(fa_dir, "{}.fasta".format(acc)) seq = df_parsed.loc[acc, "full_seq"] with open(out_fasta, "w") as f: f.write(">{}\n{}".format(acc, seq)) n_prot_TMSEG_file_not_in_list = len(set(acc_list_TMSEG) - set(acc_list_orig)) logging.info("n_prot_TMSEG_file_not_in_list as not in listxx_parsed.csv = {} ({} remaining)".format(n_prot_TMSEG_file_not_in_list, len(TMSEG_avail_list))) df_PLS.loc["n_prot_TMSEG_file_not_in_list"] = (n_prot_TMSEG_file_not_in_list, t) if df_TMSEG.shape[0] == 0: return sys.stdout.write('no remaining proteins in list!') # get list of uniprot accessions of proteins where no transmembrane region was predicted list_nonTMD = [] for acc in df_TMSEG.index: if 'N' in df_TMSEG.loc[acc, 'topo']: list_nonTMD.append(acc) # write list of nonTM proteins to file # outpath = '/Volumes/Musik/Databases/TMSEG/humanU90_nonTMD.txt' file = open(TMSEG_nonTM_outpath, 'w') for line in list_nonTMD: file.write('{}\n'.format(line)) file.close() # drop proteins that do not contain TM regions df_TMSEG = df_TMSEG.drop(list_nonTMD) # create a boolean whether the TMSEG topology is available TMSEG_avail_and_TM = set(TMSEG_avail_list) - set(list_nonTMD) TMSEG_avail_but_SOL = set(acc_list_orig).intersection(set(list_nonTMD)) df_parsed["membrane"] = np.nan df_parsed.loc[TMSEG_avail_and_TM, "membrane"] = True df_parsed.loc[TMSEG_avail_but_SOL, "membrane"] = False # add seqlen and indices for all TMD and SiPe regions df_TMSEG["seqlen"] = df_TMSEG.full_seq.apply(lambda x: len(x)) #df_TMSEG['M_indices'] = df_TMSEG.topo.apply(get_list_TM_residues_from_topo_string) #df_TMSEG['SiPe_indices'] = df_TMSEG.topo.apply(get_list_TM_residues_from_topo_string, args=("S")) df_TMSEG['TM_indices'] = df_TMSEG.topo.apply(get_TM_indices_from_TMSEG_topo_str) df_TMSEG['SiPe_indices'] = df_TMSEG.topo.apply(get_TM_indices_from_TMSEG_topo_str, args=("S")) # # Creating new list (nested list) # nested_list_of_membrane_borders = [] # # ######################################################################################## # # # # # Extract the membrane indices in UniProt Indexing style # # # # # ######################################################################################## # # Filling nest with lists of start and end-points # for m_index_list in df_TMSEG.M_indices: # m_borders = [] # # add the first membrane index (e.g. 13) # m_borders.append(m_index_list[0]) # m_borders = korbinian.prot_list.parse_OMPdb.check_for_border(m_index_list, m_borders) # # add the last membrane index (e.g. 33) # m_borders.append(m_index_list[-1]) # nested_list_of_membrane_borders.append(m_borders) # # # DEPRECATED # #FOR CONSISTENCY, LEAVE INDEXING STYLE AS UNIPROT # # ######################################################################################## # # # # # # # Convert to python indexing style (NECESSARY?? NOT COMPAT WITH UNIPROT!) 
# # # # # # # ######################################################################################## # # array_membrane_borders = np.array(nested_list_of_membrane_borders) # # nested_list_m_borders_python_indexstyle = [] # # for subarray in array_membrane_borders: # # # convert to array # # subarray = np.array(subarray) # # # add 1 to the second index number, to allow slicing # # subarray[1::2] = subarray[1::2] + 1 # # # add to list with corrected values, python index style # # nested_list_m_borders_python_indexstyle.append(list(subarray)) # # # Creating new column, which contains start and end-points # #df_TMSEG["Membrane_Borders"] = nested_list_m_borders_python_indexstyle # # df_TMSEG["Membrane_Borders"] = nested_list_of_membrane_borders # # # Creating new column, which contains the number of TMDS # #df_TMSEG["number_of_TMDs"] = df_TMSEG.Membrane_Borders.apply(lambda x: len(x) / 2) # # df_TMSEG["TM_indices"] = df_TMSEG["Membrane_Borders"].apply(lambda x: tuple(zip(x[::2], x[1::2]))) # create a list of [TM01, TM02, TM03, etc. long_list_of_TMDs = [] for i in range(1, 50): long_list_of_TMDs.append("TM{:02d}".format(i)) ## for the .set_value function, set dtype as object df_TMSEG["list_of_TMDs"] = "" df_TMSEG["list_of_TMDs"].astype(object) sys.stdout.write('slicing TMD and nonTMD sequences:\n') for n, acc in enumerate(df_TMSEG.index): # get nested tuple of TMDs nested_tup_TMs = df_TMSEG.loc[acc, "TM_indices"] # slice long list of TMD names to get an appropriate list for that protein [TM01, TM02, TM03, etc. len_nested_tup_TMs = len(nested_tup_TMs) list_of_TMDs = long_list_of_TMDs[:len_nested_tup_TMs] # add that list to the dataframe (could also be added as a stringlist, but that's irritating somehow) #df_TMSEG.loc[acc, 'list_of_TMDs'] = list_of_TMDs df_TMSEG.set_value(acc, "list_of_TMDs", list_of_TMDs) # set seq for slicing full_seq = df_TMSEG.loc[acc, "full_seq"] # topo = dft.loc[acc, "Topology"] # iterate through all the TMDs of that protein, slicing out the sequences for i, TMD in enumerate(list_of_TMDs): TMD = list_of_TMDs[i] start, end = nested_tup_TMs[i] # with UniProt indexing, need to slice with -1, not like python index style df_TMSEG.loc[acc, "%s_start" % TMD] = start df_TMSEG.loc[acc, "%s_end" % TMD] = end # for python indexing of the TMD rather than uniprot, the start should be minus 1 python_indexing_tuple = (start - 1, end) df_TMSEG.loc[acc, "%s_seq" % TMD] = utils.slice_with_listlike(full_seq, python_indexing_tuple) df_TMSEG.loc[acc, "%s_seqlen" % TMD] = len(df_TMSEG.loc[acc, "%s_seq" % TMD]) # dft.loc[acc, TMD + "_top"] = utils.slice_with_listlike(topo, tup) #DEPRECATED, ONLY REINSTATE IF YOU REALLY WANT TMSEG SP DEFINITIONS TO STAY # # add signal peptides and their corresponding values to list_of_TMDs # if analyse_sp == True: # if type(df_parsed.loc[acc, 'SP01_seq']) == str: # list_of_TMDs.append('SP01') # df_TMSEG.set_value(acc, "list_of_TMDs", list_of_TMDs) # # code necessary for TMSEG signal peptides - depreciated by MO 20.04.2017 # SiPe_indices = df_TMSEG.loc[acc, 'SiPe_indices'] # if SiPe_indices != []: # df_TMSEG.loc[acc, 'SP01_start'] = SiPe_indices[0] # df_TMSEG.loc[acc, 'SP01_end'] = SiPe_indices[-1] # df_TMSEG.loc[acc, 'SP01_seq'] = full_seq[SiPe_indices[0]:SiPe_indices[-1]+1] # list_of_TMDs.append('SP01') # df_TMSEG.set_value(acc, "list_of_TMDs", list_of_TMDs) if n % 50 == 0 and n != 0: sys.stdout.write(". 
") sys.stdout.flush() if n % 500 == 0: sys.stdout.write("\n") sys.stdout.flush() # slice out the nonTM segments with a function # note that for some reason, this is very slow after merging the dataframes df_TMSEG = slice_nonTMD_in_prot_list(df_TMSEG) #df_TOP = pd.merge(df_parsed, df_TMSEG, how="left", left_on=True, suffixes=('_list_parsed', ""))# left_index=True, right_index=False, df_TOP = df_parsed.merge(df_TMSEG, how="left", suffixes=('_list_parsed', "")) # left_index=True, right_index=False, # actually, I'd prefer to keep these for troubleshooting purposes # cols_to_drop = ['M_indices', 'SiPe_indices', 'Membrane_Borders', 'TM_indices'] # df_TMSEG.drop(cols_to_drop, axis=1, inplace=True) elif os.path.isfile(TMSEG_top_txtoutput_path): df_PLS.loc["TMSEG_top_txtoutput_path", :] = ("exists", t) """ PARSE DATA WITH THE FOLLOWING FORMAT, proteins listed one after each other IMPORTANT : this format is sub-optimal, because the sequences come from uniprot, and the predictions from TMPRED Can only be trusted when they are from the same date: best to use TMPRED output which also contains the orig sequence. --- ID: A4ZUB1 # TRANSMEM 6 18 4 # TRANSMEM 50 67 7 SIG: SIGNAL 1 22 {ECO:0000255}. TMH: TRANSMEM 53 69 Helical. {ECO:0000255}. --- """ # if the regions column in the lists tab is "TM01" instead of the usual "TM", take only the first TM take_only_the_first_TM = s["regions"] == "TM01" # create dataframe for text topology (dftt) dftt = pd.DataFrame() with open(TMSEG_top_txtoutput_path, "r") as f: acc_counter = 0 for line in f: if line[0:4] == "ID: ": acc = line.split(" ")[1].strip("\n") dftt.loc[acc_counter, "acc"] = acc acc_counter += 1 # reset the TM_counter TM_counter = 1 if line[0:10] == "# TRANSMEM": if TM_counter > 1: if take_only_the_first_TM: # skip to next line, as the first TM is already taken continue # split by tab split = line.split("\t") # the start is split[1] (end is not really necessary here) start = split[1] # note that acc_counter += 1 is already + 1 for the next protein, # therefore the dftt.loc is acc_counter-1 dftt.loc[acc_counter - 1, "TM{:02d}_start".format(TM_counter)] = start end = split[2] # note that acc_counter += 1 is already + 1 for the next protein, # therefore the dftt.loc is acc_counter-1 dftt.loc[acc_counter - 1, "TM{:02d}_end".format(TM_counter)] = end TM_counter += 1 # add an extra number_of_TMDs column, so they can be counted consistently dftt["number_of_TMDs"] = 0 for row in dftt.index: # drop TM02_start etc if they don't contain data subset = dftt.loc[row, :].dropna() # count columns n_cols = subset.shape[0] # calculate number of columns (TM01_start, TM01_end) /2, which is the number of TMDs number_of_TMDs = int((n_cols - 2) / 2) dftt.loc[row, "number_of_TMDs"] = number_of_TMDs dftt.loc[row, "list_of_TMDs"] = str(["TM{:02d}".format(n) for n in range(1, number_of_TMDs + 1)]) # set the acc as the index, so it can be merged with df_parsed dftt.set_index("acc", drop=False, inplace=True) # save temp csv with TMSEG output TMSEG_txtoutput_parsed_csv = TMSEG_top_txtoutput_path[:-4] + "TMSEG_txtoutput_parsed.csv" dftt.to_csv(TMSEG_txtoutput_parsed_csv) df = pd.merge(dftt, df_parsed, left_index=True, right_index=True, suffixes=('', '_list_parsed')) # convert from string to python list if isinstance(df['list_of_TMDs'][0], str): df['list_of_TMDs'] = df['list_of_TMDs'].dropna().apply(lambda x: ast.literal_eval(x)) # (re)define sequence length df["seqlen"] = df["full_seq"].str.len() # slice out all the TMD sequences for n, acc in enumerate(df.index): list_of_TMDs = 
df.loc[acc, "list_of_TMDs"] # add that list to the dataframe (could also be added as a stringlist, but that's irritating somehow) # set seq for slicing full_seq = df.loc[acc, "full_seq"] # iterate through all the TMDs of that protein, slicing out the sequences for i in range(len(list_of_TMDs)): TMD = list_of_TMDs[i] tuple_slice_indices = (df.loc[acc, "%s_start" % TMD], df.loc[acc, "%s_end" % TMD]) df.loc[acc, "%s_seq" % TMD] = utils.slice_with_listlike(full_seq, tuple_slice_indices) df.loc[acc, "%s_seqlen" % TMD] = len(df.loc[acc, "%s_seq" % TMD]) # add signal peptides and their corresponding values to list_of_TMDs if analyse_sp == True: if type(df_parsed.loc[acc, 'SP01_seq']) == str: list_of_TMDs.append('SP01') df.set_value(acc, "list_of_TMDs", list_of_TMDs) start = time.clock() # slice out the nonTM segments with a function # note that for some reason, this is very slow after merging the dataframes df_TOP = slice_nonTMD_in_prot_list(df) sys.stdout.write("\ntime taken : {:0.03f} s".format(time.clock() - start)) else: raise FileNotFoundError("None of the TMSEG combined output files were found.") # define number of TMDs (includes Signal peptides!) df_TOP["number_of_TMDs"] = df_TOP["list_of_TMDs"].dropna().apply(lambda x : len(x)) df_TOP['parse_TMSEG'] = True df_TOP.to_csv(pathdict["list_parsed_csv"], sep=",", quoting=csv.QUOTE_NONNUMERIC) logging.info("\n~~~~~~~~~~~~ parse_TMSEG_results_DEPRECATED is finished ~~~~~~~~~~~~")