def delete(self, study_id, file_name):
    # query validation
    parser = reqparse.RequestParser()
    parser.add_argument('row_num', help="The row number of the cell(s) to remove (exclude header)",
                        location="args")
    args = parser.parse_args()
    row_num = args['row_num']

    # param validation
    if study_id is None or file_name is None or row_num is None:
        abort(404)
    study_id = study_id.upper()

    # User authentication
    user_token = None
    if "user_token" in request.headers:
        user_token = request.headers["user_token"]

    # check for access rights
    is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \
        study_status = wsc.get_permissions(study_id, user_token)
    if not write_access:
        abort(403)

    file_name = os.path.join(study_location, file_name)
    try:
        file_df = read_tsv(file_name)
    except FileNotFoundError:
        abort(400, "The file " + file_name + " was not found")

    row_nums = row_num.split(",")

    # Need to remove the highest row number first as the DataFrame dynamically re-orders when one row is removed
    sorted_num_rows = [int(x) for x in row_nums]
    sorted_num_rows.sort(reverse=True)
    for num in sorted_num_rows:
        file_df = file_df.drop(file_df.index[num])  # Drop row(s) in the spreadsheet
    message = write_tsv(file_df, file_name)

    # To be sure we read the file again
    try:
        file_df = read_tsv(file_name)
    except FileNotFoundError:
        abort(400, "The file " + file_name + " was not found")

    df_data_dict = totuples(file_df.reset_index(), 'rows')

    # Get an indexed header row
    df_header = get_table_header(file_df)

    return {'header': df_header, 'data': df_data_dict, 'message': message}
def get(self, study_id):
    log_request(request)

    # param validation
    if study_id is None:
        abort(404)
    study_id = study_id.upper()

    # User authentication
    user_token = None
    if 'user_token' in request.headers:
        user_token = request.headers['user_token']
    if user_token is None:
        abort(401)

    # query validation
    parser = reqparse.RequestParser()
    parser.add_argument('filename1', help='TSV filename one')
    parser.add_argument('filename2', help='TSV filename two')
    filename1 = None
    filename2 = None
    if request.args:
        args = parser.parse_args(req=request)
        filename1 = args['filename1'].lower() if args['filename1'] else None
        filename2 = args['filename2'].lower() if args['filename2'] else None

    if not filename1 or not filename2:
        logger.warning("Missing TSV filenames.")
        abort(404, "Missing TSV filenames.")

    # check for access rights
    is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \
        study_status = wsc.get_permissions(study_id, user_token)
    if not read_access:
        abort(401, "Study does not exist or you do not have access to this study.")

    location = study_location

    df1 = read_tsv(filename1)
    df2 = read_tsv(filename2)

    diff_df = diff_pd(df1, df2)
    return jsonify({"entries": diff_df})
def post(self, study_id, file_name): parser = reqparse.RequestParser() parser.add_argument('new_column_name', help="Name of new column") new_column_name = None parser.add_argument('new_column_position', help="The position (column #) of new column") new_column_position = None parser.add_argument('new_column_default_value', help="The (optional) default value of new column") new_column_default_value = None if request.args: args = parser.parse_args(req=request) new_column_name = args['new_column_name'] new_column_position = args['new_column_position'] new_column_default_value = args['new_column_default_value'] if new_column_name is None: abort(404, "Please provide valid name for the new column") # param validation if study_id is None or file_name is None: abort(404, 'Please provide valid parameters for study identifier and file name') study_id = study_id.upper() # User authentication user_token = None if "user_token" in request.headers: user_token = request.headers["user_token"] # check for access rights is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \ study_status = wsc.get_permissions(study_id, user_token) if not write_access: abort(403) file_name = os.path.join(study_location, file_name) try: table_df = read_tsv(file_name) except FileNotFoundError: abort(400, "The file " + file_name + " was not found") # Need to add values for each existing row (not header) new_col = [] for row_val in range(table_df.shape[0]): new_col.append(new_column_default_value) # Add new column to the spreadsheet table_df.insert(loc=int(new_column_position), column=new_column_name, value=new_col, allow_duplicates=True) df_data_dict = totuples(table_df.reset_index(), 'rows') # Get an indexed header row df_header = get_table_header(table_df) message = write_tsv(table_df, file_name) return {'header': df_header, 'data': df_data_dict, 'message': message}
def put(self, study_id, file_name):
    try:
        data_dict = json.loads(request.data.decode('utf-8'))
        columns_rows = data_dict['data']
    except KeyError:
        columns_rows = None
    if columns_rows is None:
        abort(404, "Please provide valid key-value pairs for the cell value. "
                   "The JSON string has to have a 'data' element")

    # param validation
    if study_id is None or file_name is None:
        abort(404, 'Please provide valid parameters for study identifier and/or file name')
    study_id = study_id.upper()

    # User authentication
    user_token = None
    if "user_token" in request.headers:
        user_token = request.headers["user_token"]

    # check for access rights
    is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \
        study_status = wsc.get_permissions(study_id, user_token)
    if not write_access:
        abort(403)

    file_name = os.path.join(study_location, file_name)
    try:
        table_df = read_tsv(file_name)
    except FileNotFoundError:
        abort(400, "The file " + file_name + " was not found")

    for column in columns_rows:
        cell_value = column['value']
        row_index = column['row']
        column_index = column['column']
        # Need to add values for column and row (not header)
        try:
            table_df.iloc[int(row_index), int(column_index)] = cell_value
        except ValueError:
            abort(417, "Unable to find the required 'value', 'row' and 'column' values")

    # Write the new row back in the file
    message = write_tsv(table_df, file_name)

    df_data_dict = totuples(table_df.reset_index(), 'rows')

    # Get an indexed header row
    df_header = get_table_header(table_df)

    return {'header': df_header, 'rows': df_data_dict, 'message': message}
def split_metabolon_assays(study_location, study_id):
    p_start = 'a__POS'
    n_start = 'a__NEG'
    end = '_m'
    pos = p_start + end
    neg = n_start + end
    sample_col = 'Sample Name'

    for a_files in glob.glob(os.path.join(study_location, 'a__*_metabolite_profiling_mass_spectrometry.txt')):
        if pos in a_files:
            p_assay = read_tsv(a_files)
            p_filename = a_files
            try:
                # split based on 'POSEAR' and 'POSLAT'
                write_tsv(p_assay.loc[p_assay[sample_col].str.contains('POSEAR')],
                          p_filename.replace(pos, p_start + '_1' + end))
                write_tsv(p_assay.loc[p_assay[sample_col].str.contains('POSLAT')],
                          p_filename.replace(pos, p_start + '_2' + end))
            except:
                return False, "Failed to generate 2 POSITIVE ISA-Tab assay files for study " + study_id
        elif neg in a_files:
            n_assay = read_tsv(a_files)
            n_filename = a_files
            try:
                # split based on 'NEG' and 'POL'
                write_tsv(n_assay.loc[n_assay[sample_col].str.contains('NEG')],
                          n_filename.replace(neg, n_start + '_1' + end))
                write_tsv(n_assay.loc[n_assay[sample_col].str.contains('POL')],
                          n_filename.replace(neg, n_start + '_2' + end))
            except:
                return False, "Failed to generate 2 NEGATIVE ISA-Tab assay files for study " + study_id

    status, message = True, "Generated 4 ISA-Tab assay files for study " + study_id
    return status, message
def delete(self, study_id, file_name): # param validation if study_id is None or file_name is None: abort(417, "Please provide a study id and TSV file name") fname, ext = os.path.splitext(file_name) ext = ext.lower() if ext not in ('.tsv', '.csv', '.txt'): abort(400, "The file " + file_name + " is not a valid TSV or CSV file") try: data_dict = json.loads(request.data.decode('utf-8')) delete_columns = data_dict['data'] except Exception as e: abort(417, str(e)) # param validation columns = delete_columns['columns'] if columns is None: abort(417, 'Please ensure the JSON contains a "columns" element') # User authentication user_token = None if "user_token" in request.headers: user_token = request.headers["user_token"] # check for access rights is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, study_status = \ wsc.get_permissions(study_id, user_token) if not write_access: abort(403) audit_status, dest_path = write_audit_files(study_location) for column in columns: tsv_file = os.path.join(study_location, file_name) if not os.path.isfile(tsv_file): abort(406, "File " + file_name + " does not exist") else: file_df = read_tsv(tsv_file) try: file_df.drop(column, axis=1, inplace=True) write_tsv(file_df, tsv_file) except Exception as e: logger.error("Could not remove column '" + column + "' from file " + file_name) logger.error(str(e)) return {"Success": "Removed column(s) from " + file_name}
def post(self, study_id): data_dict = json.loads(request.data.decode('utf-8')) assay_file_names = data_dict['data'] # param validation if study_id is None: abort(417) # param validation if assay_file_names is None: abort(417, 'Please ensure the JSON has at least one "assay_file_name" element') # User authentication user_token = None if "user_token" in request.headers: user_token = request.headers["user_token"] logger.info('MAF: Getting ISA-JSON Study %s', study_id) # check for access rights is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \ study_status = wsc.get_permissions(study_id, user_token) if not read_access: abort(403) maf_feedback = "" for assay_file_name in assay_file_names: annotation_file_name = None assay_file = assay_file_name['assay_file_name'] full_assay_file_name = os.path.join(study_location, assay_file) if not os.path.isfile(full_assay_file_name): abort(406, "Assay file " + assay_file + " does not exist") assay_df = read_tsv(full_assay_file_name) annotation_file_name = assay_df['Metabolite Assignment File'].iloc[0] maf_df, new_annotation_file_name, new_column_counter = \ create_maf(None, study_location, assay_file, annotation_file_name=annotation_file_name) if annotation_file_name != new_annotation_file_name: assay_df['Metabolite Assignment File'] = new_annotation_file_name write_tsv(assay_df, full_assay_file_name) annotation_file_name = new_annotation_file_name if maf_df.empty: abort(406, "MAF file could not be created or updated") maf_feedback = maf_feedback + ". New row(s):" + str(new_column_counter) + " for assay file " + \ annotation_file_name return {"success": "Added/Updated MAF(s)" + maf_feedback}
def get(self, study_id, file_name): # param validation if study_id is None or file_name is None: logger.info('No study_id and/or TSV file name given') abort(404) fname, ext = os.path.splitext(file_name) ext = ext.lower() if ext not in ('.tsv', '.csv', '.txt'): abort(400, "The file " + file_name + " is not a valid TSV or CSV file") study_id = study_id.upper() file_name_param = file_name # store the passed filename for simplicity # User authentication user_token = None if "user_token" in request.headers: user_token = request.headers["user_token"] logger.info('Assay Table: Getting ISA-JSON Study %s', study_id) # check for access rights is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \ study_status = wsc.get_permissions(study_id, user_token) if not read_access: abort(403) if file_name == 'metabolights_zooma.tsv': # This will edit the MetaboLights Zooma mapping file if not is_curator: abort(403) file_name = app.config.get('MTBLS_ZOOMA_FILE') else: file_name = os.path.join(study_location, file_name) logger.info('Trying to load TSV file (%s) for Study %s', file_name, study_id) # Get the Assay table or create a new one if it does not already exist try: file_df = read_tsv(file_name) except FileNotFoundError: abort(400, "The file " + file_name + " was not found") df_data_dict = totuples(file_df.reset_index(), 'rows') # Get an indexed header row df_header = get_table_header(file_df, study_id, file_name_param) return {'header': df_header, 'data': df_data_dict}
def update_characteristics_in_sample_sheet(onto_name, new_url, header, old_value, new_value,
                                           study_location, isa_study):
    try:
        """
        Update column values in sample file(s).
        The column header looks like 'Characteristics[<characteristics name>]'
        """
        sample_file_name = os.path.join(study_location, isa_study.filename)  # Sample sheet
        header = 'Characteristics[' + header + ']'
        if sample_file_name:
            df = read_tsv(sample_file_name)
            '''
            This is slightly complicated. In a DataFrame, identical columns are separated with .n.
            "Organism part" should always be the 2nd group of columns, but to be sure we should
            use the column position (col_pos)
            '''
            col_pos = df.columns.get_loc(header)  # Use this to determine the location of the additional columns
            header_source_ref = df.columns[col_pos + 1]  # 'Term Source REF' (+.n)
            header_acc_number = df.columns[col_pos + 2]  # 'Term Accession Number' (+.n)

            try:
                # if old_value != new_value:  # Do we need to change the cell values?
                df.loc[df[header] == old_value, header_source_ref] = onto_name  # Term Source REF(.n) changed
                df.loc[df[header] == old_value, header_acc_number] = new_url    # Term Accession Number(.n) changed
                df.loc[df[header] == old_value, header] = new_value             # Characteristics name changed
                write_tsv(df, sample_file_name)
                logger.info(old_value + " " + new_value + " has been renamed in " + sample_file_name)
            except Exception as e:
                logger.warning(old_value + " " + new_value +
                               " was not used in the sheet or we failed updating " + sample_file_name +
                               ". Error: " + str(e))
    except Exception as e:
        logger.error("Could not update the ontology value " + old_value + " in " + sample_file_name)
def check_maf_for_pipes(study_location, annotation_file_name):
    annotation_file_name = os.path.join(study_location, annotation_file_name)
    try:
        maf_df = read_tsv(annotation_file_name)
    except FileNotFoundError:
        abort(400, "The file " + annotation_file_name + " was not found")
    maf_len = len(maf_df.index)

    # Any rows to split?
    new_maf_df = split_rows(maf_df)
    new_maf_len = len(new_maf_df.index)

    file_name = annotation_file_name + '.split'
    if maf_len != new_maf_len:  # We did find |, so we create a new MAF
        write_tsv(new_maf_df, file_name)

    return maf_df, maf_len, new_maf_df, new_maf_len, file_name
def read_characteristics_from_sample_sheet(study_location, isa_study):
    sample_orgs = []
    try:
        sample_file_name = os.path.join(study_location, isa_study.filename)  # Sample sheet
        if sample_file_name:
            df = read_tsv(sample_file_name)
            '''
            This is slightly complicated in a DF, identical columns are separated with .n.
            "Organism part" should always be the 2nd group of columns, but to be sure we should
            use the column position (col_pos)
            '''
            col_pos1 = df.columns.get_loc('Characteristics[Organism]')  # Use this to determine the location of the additional columns
            header_source_ref1 = df.columns[col_pos1 + 1]  # 'Term Source REF'
            header_acc_number1 = df.columns[col_pos1 + 2]  # 'Term Accession Number'

            col_pos2 = df.columns.get_loc('Characteristics[Organism part]')
            header_source_ref2 = df.columns[col_pos2 + 1]  # 'Term Source REF' (+.n)
            header_acc_number2 = df.columns[col_pos2 + 2]  # 'Term Accession Number' (+.n)

            new_df = df[['Characteristics[Organism]', header_source_ref1, header_acc_number1,
                         'Characteristics[Organism part]', header_source_ref2, header_acc_number2]].copy()
            new_df.columns = ['Characteristics[Organism]', 'Term Source REF', 'Term Accession Number',
                              'Characteristics[Organism part]', 'Term Source REF.1', 'Term Accession Number.1']

            return new_df.drop_duplicates()
    except Exception as e:
        logger.error("Could not read 'Characteristics[Organism]' and/or 'Characteristics[Organism part]' in "
                     + sample_file_name)
        abort(400)
def get(self, study_id, file_name): # param validation if study_id is None or file_name is None: logger.info('No study_id and/or TSV file name given') abort(404) study_id = study_id.upper() file_name_param = file_name # store the passed filename for simplicity # User authentication user_token = None if "user_token" in request.headers: user_token = request.headers["user_token"] logger.info('Assay Table: Getting ISA-JSON Study %s', study_id) # check for access rights is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \ study_status = wsc.get_permissions(study_id, user_token) if not read_access: abort(403) if file_name == 'metabolights_zooma.tsv': # This will edit the MetaboLights Zooma mapping file if not is_curator: abort(403) file_name = app.config.get('MTBLS_ZOOMA_FILE') else: file_name = os.path.join(study_location, file_name) logger.info('Trying to load TSV file (%s) for Study %s', file_name, study_id) # Get the Assay table or create a new one if it does not already exist try: file_df = read_tsv(file_name) except FileNotFoundError: abort(400, "The file " + file_name + " was not found") df_data_dict = totuples(file_df.reset_index(), 'rows') # Get an indexed header row df_header = get_table_header(file_df, study_id, file_name_param) return {'header': df_header, 'data': df_data_dict}
def put(self, study_id, file_name):
    # param validation
    if study_id is None or file_name is None:
        abort(406, 'Please provide valid parameters for study identifier and TSV file name')
    fname, ext = os.path.splitext(file_name)
    ext = ext.lower()
    if ext not in ('.tsv', '.csv', '.txt'):
        abort(400, "The file " + file_name + " is not a valid TSV or CSV file")
    study_id = study_id.upper()

    try:
        data_dict = json.loads(request.data.decode('utf-8'))
        new_rows = data_dict['data']  # Use "index:n" element, this is the original row number
    except KeyError:
        new_rows = None
    if new_rows is None:
        abort(404, "Please provide valid data for updated new row(s). "
                   "The JSON string has to have a 'data' element")

    for row in new_rows:
        try:
            row_index = row['index']  # Check if we have a value in the row number(s)
        except (KeyError, Exception):
            row_index = None

        if new_rows is None or row_index is None:
            abort(404, "Please provide valid data for the updated row(s). "
                       "The JSON string has to have an 'index:n' element in each (JSON) row. "
                       "The header row can not be updated")

    # User authentication
    user_token = None
    if "user_token" in request.headers:
        user_token = request.headers["user_token"]

    # check for access rights
    is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \
        study_status = wsc.get_permissions(study_id, user_token)
    if not write_access:
        abort(403)

    file_name = os.path.join(study_location, file_name)
    try:
        file_df = read_tsv(file_name)
    except FileNotFoundError:
        abort(400, "The file " + file_name + " was not found")

    for row in new_rows:
        try:
            row_index_int = int(row['index'])
        except Exception:
            row_index_int = None  # was 'row_index_int is None', a no-op comparison

        # Validate column names in new rows
        valid_column_name, message = validate_row(file_df, row, 'put')
        if not valid_column_name:
            abort(417, message)

        if row_index_int is not None:
            file_df = file_df.drop(file_df.index[row_index_int])  # Remove the old row from the spreadsheet
            # pop the "index:n" from the new_row before updating
            row.pop('index', None)  # Remove "index:n" element, this is the original row number
            file_df = insert_row(row_index_int, file_df, row)  # Update the row in the spreadsheet

    message = write_tsv(file_df, file_name)

    df_data_dict = totuples(file_df.reset_index(), 'rows')

    # Get an indexed header row
    df_header = get_table_header(file_df)

    return {'header': df_header, 'data': df_data_dict, 'message': message}
def post(self, study_id, file_name): log_request(request) try: data_dict = json.loads(request.data.decode('utf-8')) data = data_dict['data'] new_row = data['rows'] except KeyError: new_row = None data = None if new_row is None: abort( 417, "Please provide valid data for updated new row(s). The JSON string has to have a 'rows' element" ) try: for element in new_row: element.pop( 'index', None ) # Remove "index:n" element, this is the original row number except: logger.info('No index (row num) supplied, ignoring') # param validation if study_id is None or file_name is None: abort( 404, 'Please provide valid parameters for study identifier and TSV file name' ) fname, ext = os.path.splitext(file_name) ext = ext.lower() if ext not in ('.tsv', '.csv', '.txt'): abort(400, "The file " + file_name + " is not a valid TSV or CSV file") study_id = study_id.upper() # User authentication user_token = None if "user_token" in request.headers: user_token = request.headers["user_token"] # check for access rights is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \ study_status = wsc.get_permissions(study_id, user_token) if not write_access: abort(403) if file_name == 'metabolights_zooma.tsv': # This will edit the MetaboLights Zooma mapping file if not is_curator: abort(403) file_name = app.config.get('MTBLS_ZOOMA_FILE') else: file_name = os.path.join(study_location, file_name) try: file_df = read_tsv(file_name) except FileNotFoundError: abort(400, "The file name was not found") # Validate column names in new rows valid_column_name, message = validate_row(file_df, new_row, "post") if not valid_column_name: abort(417, message) if data: try: start_index = data['index'] if start_index == -1: start_index = 0 start_index = start_index - 0.5 except KeyError: start_index = len(file_df.index) # Map the complete row first, update with new_row complete_row = {} for col in file_df.columns: complete_row[col] = "" if not new_row: logger.warning( "No new row information provided. Adding empty row " + file_name + ", row " + str(complete_row)) else: for row in new_row: complete_row.update(row) row = complete_row line = pd.DataFrame(row, index=[start_index]) file_df = file_df.append(line, ignore_index=False) file_df = file_df.sort_index().reset_index(drop=True) start_index += 1 file_df = file_df.replace(np.nan, '', regex=True) message = write_tsv(file_df, file_name) # Get an indexed header row df_header = get_table_header(file_df) # Get the updated data table try: df_data_dict = totuples(read_tsv(file_name), 'rows') except FileNotFoundError: abort(400, "The file " + file_name + " was not found") return {'header': df_header, 'data': df_data_dict, 'message': message}
def put(self, study_id, file_name):
    try:
        data_dict = json.loads(request.data.decode('utf-8'))
        columns_rows = data_dict['data']
    except KeyError:
        columns_rows = None
    if columns_rows is None:
        abort(404, "Please provide valid key-value pairs for the cell value. "
                   "The JSON string has to have a 'data' element")

    # param validation
    if study_id is None or file_name is None:
        abort(404, 'Please provide valid parameters for study identifier and/or file name')
    fname, ext = os.path.splitext(file_name)
    ext = ext.lower()
    if ext not in ('.tsv', '.csv', '.txt'):
        abort(400, "The file " + file_name + " is not a valid TSV or CSV file")
    study_id = study_id.upper()

    # User authentication
    user_token = None
    if "user_token" in request.headers:
        user_token = request.headers["user_token"]

    # check for access rights
    is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \
        study_status = wsc.get_permissions(study_id, user_token)
    if not write_access:
        abort(403)

    file_name = os.path.join(study_location, file_name)
    try:
        table_df = read_tsv(file_name)
    except FileNotFoundError:
        abort(400, "The file " + file_name + " was not found")

    for column in columns_rows:
        cell_value = column['value']
        row_index = column['row']
        column_index = column['column']
        # Need to add values for column and row (not header)
        try:
            table_df.iloc[int(row_index), int(column_index)] = cell_value
        except ValueError as e:
            logger.error("(ValueError) Unable to find the required 'value', 'row' and 'column' values. Value: "
                         + str(cell_value) + ", row: " + str(row_index) + ", column: " + str(column_index)
                         + ". " + str(e))
            abort(417, "(ValueError) Unable to find the required 'value', 'row' and 'column' values. Value: "
                  + str(cell_value) + ", row: " + str(row_index) + ", column: " + str(column_index))
        except IndexError as e:  # 'as e' was missing, so str(e) below raised a NameError
            logger.error("(IndexError) Unable to find the required 'value', 'row' and 'column' values. Value: "
                         + str(cell_value) + ", row: " + str(row_index) + ", column: " + str(column_index)
                         + ". " + str(e))
            abort(417, "(IndexError) Unable to find the required 'value', 'row' and 'column' values. Value: "
                  + str(cell_value) + ", row: " + str(row_index) + ", column: " + str(column_index))

    # Write the new row back in the file
    message = write_tsv(table_df, file_name)

    df_data_dict = totuples(table_df.reset_index(), 'rows')

    # Get an indexed header row
    df_header = get_table_header(table_df)

    return {'header': df_header, 'rows': df_data_dict, 'message': message}
def post(self, study_id, file_name): try: data_dict = json.loads(request.data.decode('utf-8')) new_columns = data_dict['data'] except KeyError: new_columns = None if new_columns is None: abort( 417, "Please provide valid key-value pairs for the new columns." "The JSON string has to have a 'data' element") # param validation if study_id is None or file_name is None: abort( 404, 'Please provide valid parameters for study identifier and/or file name' ) study_id = study_id.upper() fname, ext = os.path.splitext(file_name) ext = ext.lower() if ext not in ('.tsv', '.csv', '.txt'): abort(400, "The file " + file_name + " is not a valid TSV or CSV file") # User authentication user_token = None if "user_token" in request.headers: user_token = request.headers["user_token"] # check for access rights is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \ study_status = wsc.get_permissions(study_id, user_token) if not write_access: abort(403) file_name = os.path.join(study_location, file_name) try: table_df = read_tsv(file_name) except FileNotFoundError: abort(400, "The file " + file_name + " was not found") audit_status, dest_path = write_audit_files(study_location) # Get an indexed header row df_header = get_table_header(table_df) for column in new_columns: new_column_default_value = column['value'] new_column_name = column['name'] new_column_position = column['index'] # Need to add values for each existing row (not header) new_col = [] for row_val in range(table_df.shape[0]): new_col.append(new_column_default_value) # Check if we already have the column in the current position try: header_name = table_df.iloc[:, new_column_position].name except: header_name = "" if header_name == new_column_name: # We should update the existing column table_df.iloc[:, new_column_position] = new_col else: # Add new column to the spreadsheet table_df.insert(loc=int(new_column_position), column=new_column_name, value=new_col, allow_duplicates=True) # Get an (updated) indexed header row df_header = get_table_header(table_df) # Get all indexed rows df_data_dict = totuples(table_df.reset_index(), 'rows') message = write_tsv(table_df, file_name) return {'header': df_header, 'rows': df_data_dict, 'message': message}
def delete(self, study_id, file_name): # query validation parser = reqparse.RequestParser() parser.add_argument( 'row_num', help="The row number of the cell(s) to remove (exclude header)", location="args") args = parser.parse_args() row_num = args['row_num'] # param validation if study_id is None or file_name is None or row_num is None: abort(404) fname, ext = os.path.splitext(file_name) ext = ext.lower() if ext not in ('.tsv', '.csv', '.txt'): abort(400, "The file " + file_name + " is not a valid TSV or CSV file") study_id = study_id.upper() # User authentication user_token = None if "user_token" in request.headers: user_token = request.headers["user_token"] # check for access rights is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \ study_status = wsc.get_permissions(study_id, user_token) if not write_access: abort(403) file_name = os.path.join(study_location, file_name) try: file_df = read_tsv(file_name) except FileNotFoundError: abort(400, "The file " + file_name + " was not found") row_nums = row_num.split(",") # Need to remove the highest row number first as the DataFrame dynamically re-orders when one row is removed sorted_num_rows = [int(x) for x in row_nums] sorted_num_rows.sort(reverse=True) for num in sorted_num_rows: file_df = file_df.drop( file_df.index[num]) # Drop row(s) in the spreadsheet message = write_tsv(file_df, file_name) # To be sure we read the file again try: file_df = read_tsv(file_name) except FileNotFoundError: abort(400, "The file " + file_name + " was not found") df_data_dict = totuples(file_df.reset_index(), 'rows') # Get an indexed header row df_header = get_table_header(file_df) return {'header': df_header, 'data': df_data_dict, 'message': message}
def post(self, study_id, file_name): try: data_dict = json.loads(request.data.decode('utf-8')) new_columns = data_dict['data'] except KeyError: new_columns = None if new_columns is None: abort(417, "Please provide valid key-value pairs for the new columns." "The JSON string has to have a 'data' element") # param validation if study_id is None or file_name is None: abort(404, 'Please provide valid parameters for study identifier and/or file name') study_id = study_id.upper() # User authentication user_token = None if "user_token" in request.headers: user_token = request.headers["user_token"] # check for access rights is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \ study_status = wsc.get_permissions(study_id, user_token) if not write_access: abort(403) file_name = os.path.join(study_location, file_name) try: table_df = read_tsv(file_name) except FileNotFoundError: abort(400, "The file " + file_name + " was not found") # Get an indexed header row df_header = get_table_header(table_df) for column in new_columns: new_column_default_value = column['value'] new_column_name = column['name'] new_column_position = column['index'] # Need to add values for each existing row (not header) new_col = [] for row_val in range(table_df.shape[0]): new_col.append(new_column_default_value) # Check if we already have the column in the current position try: header_name = table_df.iloc[:, new_column_position].name except: header_name = "" if header_name == new_column_name: # We should update the existing column table_df.iloc[:, new_column_position] = new_col else: # Add new column to the spreadsheet table_df.insert(loc=int(new_column_position), column=new_column_name, value=new_col, allow_duplicates=True) # Get an (updated) indexed header row df_header = get_table_header(table_df) # Get all indexed rows df_data_dict = totuples(table_df.reset_index(), 'rows') message = write_tsv(table_df, file_name) return {'header': df_header, 'rows': df_data_dict, 'message': message}
def post(self, study_id, file_name): log_request(request) try: data_dict = json.loads(request.data.decode('utf-8')) new_row = data_dict['data'] except KeyError: new_row = None if new_row is None: abort(417, "Please provide valid data for updated new row(s). The JSON string has to have a 'data' element") try: for element in new_row: element.pop('index', None) # Remove "index:n" element, this is the original row number except: logger.info('No index (row num) supplied, ignoring') # param validation if study_id is None or file_name is None: abort(404, 'Please provide valid parameters for study identifier and TSV file name') study_id = study_id.upper() # User authentication user_token = None if "user_token" in request.headers: user_token = request.headers["user_token"] # check for access rights is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \ study_status = wsc.get_permissions(study_id, user_token) if not write_access: abort(403) if file_name == 'metabolights_zooma.tsv': # This will edit the MetaboLights Zooma mapping file if not is_curator: abort(403) file_name = app.config.get('MTBLS_ZOOMA_FILE') else: file_name = os.path.join(study_location, file_name) try: file_df = read_tsv(file_name) except FileNotFoundError: abort(400, "The file name was not found") # Validate column names in new rows valid_column_name, message = validate_row(file_df, new_row, "post") if not valid_column_name: abort(417, message) if new_row[0]: file_df = file_df.append(new_row, ignore_index=True) # Add new row to the spreadsheet (TSV file) else: file_df = file_df.append(pd.Series(), ignore_index=True) message = write_tsv(file_df, file_name) # Get an indexed header row df_header = get_table_header(file_df) # Get the updated data table try: df_data_dict = totuples(read_tsv(file_name), 'rows') except FileNotFoundError: abort(400, "The file " + file_name + " was not found") return {'header': df_header, 'data': df_data_dict, 'message': message}
def search_and_update_maf(study_location, annotation_file_name):
    sdf_file_list = []
    exiting_pubchem_file = False
    short_file_name = os.path.join(study_location, annotation_file_name.replace('.tsv', ''))
    if annotation_file_name.endswith(pubchem_end):
        exiting_pubchem_file = True
        short_file_name = os.path.join(study_location, annotation_file_name.replace(pubchem_end, ''))
    annotation_file_name = os.path.join(study_location, annotation_file_name)
    pd.options.mode.chained_assignment = None  # default='warn'

    standard_maf_columns = {"database_identifier": 0, "chemical_formula": 1, "smiles": 2, "inchi": 3}
    maf_compound_name_column = "metabolite_identification"

    try:
        maf_df = read_tsv(annotation_file_name)
    except FileNotFoundError:
        abort(400, "The file " + annotation_file_name + " was not found")
    maf_len = len(maf_df.index)

    # First make sure the existing pubchem annotated spreadsheet is loaded
    pubchem_df = maf_df.copy()

    if exiting_pubchem_file:  # The MAF has already been split and this is an existing "pubchem" file
        new_maf_df = maf_df.copy()
        new_maf_len = len(new_maf_df.index)
    else:
        # Any rows to split?
        new_maf_df = split_rows(maf_df)
        new_maf_len = len(new_maf_df.index)

        if maf_len != new_maf_len:  # We did find | so we have to use the new dataframe
            maf_df = new_maf_df

        # Remove existing row values first, because that's what we do ;-)
        for column_name in standard_maf_columns:
            maf_df.iloc[:, standard_maf_columns[column_name]] = ""

        pubchem_df = create_pubchem_df(maf_df)

    row_idx = 0
    short_df = maf_df[["database_identifier", maf_compound_name_column]]

    # Search using the compound name column
    for idx, row in short_df.iterrows():
        database_id = row[0]
        comp_name = row[1]
        print(str(idx + 1) + ' of ' + str(new_maf_len) + ' : ' + comp_name)
        if not database_id:
            start_time = time.time()
            chebi_found = False
            comp_name = comp_name.rstrip()  # Remove trailing spaces
            # comp_name = comp_name.encode('ascii', 'ignore')  # Make sure it's only searching using ASCII encoding
            if '/' in comp_name:  # Not a real name
                comp_name = comp_name.replace('/', ' ')

            search_res = wsc.get_maf_search("name", comp_name)  # This is the standard MetaboLights aka Plugin search
            if search_res['content']:
                result = search_res['content'][0]
                database_identifier = result["databaseId"]
                chemical_formula = result["formula"]
                smiles = result["smiles"]
                inchi = result["inchi"]
                name = result["name"]

                pubchem_df.iloc[row_idx, 0] = database_identifier
                pubchem_df.iloc[row_idx, 1] = chemical_formula
                pubchem_df.iloc[row_idx, 2] = smiles
                pubchem_df.iloc[row_idx, 3] = inchi
                # 4 is name / metabolite_identification from MAF

                if name:
                    if database_identifier:
                        if database_identifier.startswith('CHEBI:'):
                            chebi_found = True
                            logger.info(" -- Found ChEBI id " + database_identifier + " based on name")
                            print(" -- Found ChEBI id " + database_identifier + " based on name")
                        maf_df.iloc[row_idx, int(standard_maf_columns['database_identifier'])] = database_identifier
                    if chemical_formula:
                        maf_df.iloc[row_idx, int(standard_maf_columns['chemical_formula'])] = chemical_formula
                    if smiles:
                        maf_df.iloc[row_idx, int(standard_maf_columns['smiles'])] = smiles
                    if inchi:
                        maf_df.iloc[row_idx, int(standard_maf_columns['inchi'])] = inchi

            if not chebi_found:  # We could not find this in ChEBI, let's try other sources
                pc_name, pc_inchi, pc_inchi_key, pc_smiles, pc_cid, pc_formula, pc_synonyms, pc_structure = \
                    pubchem_search(comp_name, study_location)

                cactus_stdinchikey = cactus_search(comp_name, 'stdinchikey')
                opsin_stdinchikey = opsin_search(comp_name, 'stdinchikey')
                cactus_smiles = cactus_search(comp_name, 'smiles')
                opsin_smiles = opsin_search(comp_name, 'smiles')
                cactus_inchi = cactus_search(comp_name, 'stdinchi')
                opsin_inchi = opsin_search(comp_name, 'stdinchi')
                cactus_synonyms = cactus_search(comp_name, 'names')  # Synonyms

                ik = cactus_stdinchikey
                if pc_inchi_key:
                    ik = pc_inchi_key
                csid = get_csid(ik)

                pubchem_df.iloc[row_idx, 5] = pc_name  # 5 PubChem name
                pubchem_df.iloc[row_idx, 6] = pc_cid  # 6 PubChem CID
                if not pc_cid:
                    pc_cid = get_pubchem_cid_on_inchikey(cactus_stdinchikey, opsin_stdinchikey)
                pubchem_df.iloc[row_idx, 7] = pc_cid  # 7 PubChem CID, if none get from InChIKey search (Cactus, OBSIN)
                pubchem_df.iloc[row_idx, 8] = csid  # 8 ChemSpider ID (CSID) from INCHI
                pubchem_df.iloc[row_idx, 9] = get_ranked_values(pc_smiles, cactus_smiles, opsin_smiles, None)  # 9 final smiles
                final_inchi = get_ranked_values(pc_inchi, cactus_inchi, opsin_inchi, None)  # 10 final inchi
                pubchem_df.iloc[row_idx, 10] = final_inchi
                final_inchi_key = get_ranked_values(pc_inchi_key, cactus_stdinchikey, opsin_stdinchikey, None)  # 11 final inchikey
                pubchem_df.iloc[row_idx, 11] = final_inchi_key
                pubchem_df.iloc[row_idx, 12] = pc_smiles  # 12 pc_smiles
                pubchem_df.iloc[row_idx, 13] = cactus_smiles  # 13 cactus_smiles
                pubchem_df.iloc[row_idx, 14] = opsin_smiles  # 14 opsin_smiles
                pubchem_df.iloc[row_idx, 15] = pc_inchi  # 15 PubChem inchi
                pubchem_df.iloc[row_idx, 16] = cactus_inchi  # 16 Cactus inchi
                pubchem_df.iloc[row_idx, 17] = opsin_inchi  # 17 Opsin inchi
                pubchem_df.iloc[row_idx, 18] = pc_inchi_key  # 18 PubChem stdinchikey
                pubchem_df.iloc[row_idx, 19] = cactus_stdinchikey  # 19 cactus_stdinchikey
                pubchem_df.iloc[row_idx, 20] = opsin_stdinchikey  # 20 opsin_stdinchikey
                pubchem_df.iloc[row_idx, 21] = pc_formula  # 21 PubChem formula
                pubchem_df.iloc[row_idx, 22] = pc_synonyms  # 22 PubChem synonyms
                pubchem_df.iloc[row_idx, 23] = cactus_synonyms  # 23 Cactus synonyms
                pubchem_df.iloc[row_idx, 24] = pc_structure  # 24 PubChem structure (SDF)

                # Now we have more information, so let's try to search ChEBI again
                if final_inchi_key and len(final_inchi_key) > 0:
                    chebi_id, inchi, inchikey, name, smiles, formula = direct_chebi_search(final_inchi_key, comp_name)
                    if chebi_id:
                        database_identifier = chebi_id
                        chemical_formula = formula
                        logger.info(" -- Found ChEBI id " + database_identifier + " based on final InChIKey")
                        print(' -- Found ChEBI id ' + database_identifier + ' based on final InChIKey')

                        pubchem_df.iloc[row_idx, 0] = database_identifier
                        pubchem_df.iloc[row_idx, 1] = chemical_formula
                        pubchem_df.iloc[row_idx, 2] = smiles
                        pubchem_df.iloc[row_idx, 3] = inchi
                        # 4 is name / metabolite_identification from MAF

                        if name:  # Add to the annotated file as well
                            if database_identifier:
                                maf_df.iloc[row_idx, int(standard_maf_columns['database_identifier'])] = database_identifier
                            if chemical_formula:
                                maf_df.iloc[row_idx, int(standard_maf_columns['chemical_formula'])] = chemical_formula
                            if smiles:
                                maf_df.iloc[row_idx, int(standard_maf_columns['smiles'])] = smiles
                            if inchi:
                                maf_df.iloc[row_idx, int(standard_maf_columns['inchi'])] = inchi
                    else:
                        # Now, if we still don't have a ChEBI accession, download the structure (SDF) from PubChem
                        # and the classyFire SDF
                        sdf_file_list = get_sdf(study_location, pc_cid, pc_name, sdf_file_list, final_inchi)

            logger.info(" -- Search took %s seconds" % round(time.time() - start_time, 2))
            print(" -- Search took %s seconds" % round(time.time() - start_time, 2))
        else:
            print(" -- Skipping. Found database id " + database_id)

        row_idx += 1

    write_tsv(maf_df, short_file_name + "_annotated.tsv")
    pubchem_file = short_file_name + pubchem_end
    write_tsv(pubchem_df, pubchem_file)

    if sdf_file_list:
        concatenate_sdf_files(sdf_file_list, short_file_name + '_complete.sdf', short_file_name + '_classyfire.sdf')

    return maf_df, maf_len, new_maf_df, new_maf_len, pubchem_file
def post(self, study_id, file_name): parser = reqparse.RequestParser() parser.add_argument('new_column_name', help="Name of new column") new_column_name = None parser.add_argument('new_column_position', help="The position (column #) of new column") new_column_position = None parser.add_argument('new_column_default_value', help="The (optional) default value of new column") new_column_default_value = None if request.args: args = parser.parse_args(req=request) new_column_name = args['new_column_name'] new_column_position = args['new_column_position'] new_column_default_value = args['new_column_default_value'] if new_column_name is None: abort(404, "Please provide valid name for the new column") # param validation if study_id is None or file_name is None: abort( 404, 'Please provide valid parameters for study identifier and file name' ) study_id = study_id.upper() fname, ext = os.path.splitext(file_name) ext = ext.lower() if ext not in ('.tsv', '.csv', '.txt'): abort(400, "The file " + file_name + " is not a valid TSV or CSV file") # User authentication user_token = None if "user_token" in request.headers: user_token = request.headers["user_token"] # check for access rights is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \ study_status = wsc.get_permissions(study_id, user_token) if not write_access: abort(403) file_name = os.path.join(study_location, file_name) try: table_df = read_tsv(file_name) except FileNotFoundError: abort(400, "The file " + file_name + " was not found") audit_status, dest_path = write_audit_files(study_location) # Need to add values for each existing row (not header) new_col = [] for row_val in range(table_df.shape[0]): new_col.append(new_column_default_value) # Add new column to the spreadsheet table_df.insert(loc=int(new_column_position), column=new_column_name, value=new_col, allow_duplicates=True) df_data_dict = totuples(table_df.reset_index(), 'rows') # Get an indexed header row df_header = get_table_header(table_df) message = write_tsv(table_df, file_name) return {'header': df_header, 'data': df_data_dict, 'message': message}
def update_maf_stats(user_token):
    # database_maf_info_table_actions()  # Truncate, drop and create the database table
    for acc in get_all_study_acc():
        study_id = acc[0]
        maf_len = 0
        sample_len = 0
        assay_len = 0
        print("------------------------------------------ " + study_id +
              " ------------------------------------------")
        try:
            database_maf_info_table_actions(study_id)
        except ValueError:
            logger.error("Failed to update database for " + study_id)
            continue

        is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \
            study_status = wsc.get_permissions(study_id, user_token)

        try:
            isa_study, isa_inv, std_path = iac.get_isa_study(study_id=study_id, api_key=user_token,
                                                             skip_load_tables=True, study_location=study_location)
        except Exception as e:
            logger.error("Failed to load ISA-Tab files for study " + study_id + ". " + str(e))
            continue  # Cannot find the required metadata files, skip to the next study

        try:
            number_of_files = sum([len(files) for r, d, files in os.walk(study_location)])
        except:
            number_of_files = 0

        try:
            sample_file_name = isa_study.filename
            sample_df = read_tsv(os.path.join(study_location, sample_file_name))
            sample_len = sample_df.shape[0]
        except FileNotFoundError:
            logger.warning('No sample file found for ' + study_id)

        for assay in isa_study.assays:
            complete_maf = []
            file_name = os.path.join(study_location, assay.filename)
            logger.info('Trying to load TSV file (%s) for Study %s', file_name, study_id)
            # Get the Assay table or create a new one if it does not already exist
            try:
                assay_file_df = read_tsv(file_name)
            except Exception as e:
                logger.error("The file " + file_name + " was not found")
                continue  # Skip this assay so we do not reuse the previous assay's DataFrame

            try:
                assay_len = assay_len + assay_file_df.shape[0]
                assay_maf_name = assay_file_df['Metabolite Assignment File'].iloc[0]
                if not assay_maf_name:
                    continue  # No MAF referenced in this assay
            except Exception:
                logger.error("Error in identifying MAF column in assay")
                continue  # No MAF column found in this assay

            maf_file_name = os.path.join(study_location, assay_maf_name)  # MAF sheet
            if os.path.isfile(maf_file_name):
                try:
                    maf_df = read_tsv(maf_file_name)
                except Exception as e:
                    logger.error("The file " + maf_file_name + " was not found")
                print(study_id + " - Rows: " + str(len(maf_df)) + ". File: " + maf_file_name)
            else:
                print("Could not find file " + maf_file_name)
                continue

            maf_len = maf_len + maf_df.shape[0]

            for idx, row in maf_df.iterrows():
                maf_row = {}
                try:
                    database_identifier = row['database_identifier']
                    metabolite_identification = row['metabolite_identification']
                    maf_row.update({"acc": study_id})
                    maf_row.update({"database_identifier": database_identifier})
                    maf_row.update({"metabolite_identification": metabolite_identification})
                    maf_row.update({"database_found": is_identified(database_identifier)})
                    maf_row.update({"metabolite_found": is_identified(metabolite_identification)})
                except Exception as e:
                    logger.error('MAF stats failed for ' + study_id + '. Error: ' + str(e))
                    continue

                complete_maf.append(maf_row)

            status, msg = update_database_stats(complete_maf)  # Update once per MAF

        study_sql = "UPDATE STUDIES SET sample_rows = " + str(sample_len) + ", assay_rows = " + str(assay_len) + \
                    ", maf_rows = " + str(maf_len) + ", number_of_files = " + str(number_of_files) + \
                    " WHERE ACC = '" + str(study_id) + "';"
        status, msg = insert_update_data(study_sql)
        print("Database updated: " + study_sql)

    return status, msg
def put(self, study_id, file_name):
    # param validation
    if study_id is None or file_name is None:
        abort(406, 'Please provide valid parameters for study identifier and TSV file name')
    study_id = study_id.upper()

    try:
        data_dict = json.loads(request.data.decode('utf-8'))
        new_rows = data_dict['data']  # Use "index:n" element, this is the original row number
    except KeyError:
        new_rows = None
    if new_rows is None:
        abort(404, "Please provide valid data for updated new row(s). "
                   "The JSON string has to have a 'data' element")

    for row in new_rows:
        try:
            row_index = row['index']  # Check if we have a value in the row number(s)
        except (KeyError, Exception):
            row_index = None

        if new_rows is None or row_index is None:
            abort(404, "Please provide valid data for the updated row(s). "
                       "The JSON string has to have an 'index:n' element in each (JSON) row. "
                       "The header row can not be updated")

    # User authentication
    user_token = None
    if "user_token" in request.headers:
        user_token = request.headers["user_token"]

    # check for access rights
    is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \
        study_status = wsc.get_permissions(study_id, user_token)
    if not write_access:
        abort(403)

    file_name = os.path.join(study_location, file_name)
    try:
        file_df = read_tsv(file_name)
    except FileNotFoundError:
        abort(400, "The file " + file_name + " was not found")

    for row in new_rows:
        try:
            row_index_int = int(row['index'])
        except Exception:
            row_index_int = None  # was 'row_index_int is None', a no-op comparison

        # Validate column names in new rows
        valid_column_name, message = validate_row(file_df, row, 'put')
        if not valid_column_name:
            abort(417, message)

        if row_index_int is not None:
            file_df = file_df.drop(file_df.index[row_index_int])  # Remove the old row from the spreadsheet
            # pop the "index:n" from the new_row before updating
            row.pop('index', None)  # Remove "index:n" element, this is the original row number
            file_df = insert_row(row_index_int, file_df, row)  # Update the row in the spreadsheet

    message = write_tsv(file_df, file_name)

    df_data_dict = totuples(file_df.reset_index(), 'rows')

    # Get an indexed header row
    df_header = get_table_header(file_df)

    return {'header': df_header, 'data': df_data_dict, 'message': message}