def _decompress_in_place(archive_path, destination):
    """Expand one zip/gzip/tar file into ``destination`` using the system tools.

    The archive type is chosen by substring match on the file name, so one
    file can trigger several steps (e.g. ``.tar.gz`` is gunzipped and then
    untarred). zip and tar archives are removed after extraction; gunzip
    replaces the compressed file itself.

    Exit codes of the external tools are ignored (best effort).
    """
    name = os.path.basename(archive_path)
    if '.zip' in name:
        subprocess.call(["unzip", "-o", "-qq", archive_path, "-d", destination])
        subprocess.call(["rm", archive_path])
    if '.gz' in name:
        subprocess.call(["gunzip", "-f", archive_path])
        # gunzip replaces "<name>.gz" with "<name>". Follow the new path so a
        # ".tar.gz" can be untarred right away instead of pointing tar/rm at
        # a file that no longer exists. If gunzip failed the original file is
        # still there and the path is left untouched.
        if archive_path.endswith('.gz') and not os.path.exists(archive_path):
            archive_path = archive_path[:-len('.gz')]
    if '.tar' in name:
        subprocess.call(["tar", "-xf", archive_path, "-C", destination])
        subprocess.call(["rm", archive_path])


def copy(rawdata_directory):
    """Copy raw microarray files to the output tree and expand archives.

    The name of the parent directory of ``rawdata_directory`` is used as the
    GLDS identifier, and files are copied into
    ``<config.outdir>/<GLDS>/microarray``. A checksum entry is registered
    through ``config.get_md5sum`` before and after each copy.

    Args:
        rawdata_directory: directory holding the raw data files to copy.
    """
    #Find name of GLDS number
    GLDS = os.path.basename(os.path.dirname(rawdata_directory))
    rawdata_out = os.path.join(config.outdir, GLDS, 'microarray')

    #Make appropriate output directory
    if not os.path.exists(rawdata_out):
        os.makedirs(rawdata_out)

    #Copy everything that looks like raw data; a file is skipped only when it
    #is marked 'processed' and carries none of the raw-data tags
    for file1 in os.listdir(rawdata_directory):
        looks_raw = any(tag in file1 for tag in ('raw', 'RAW', 'Raw', 'CEL'))
        if not looks_raw and 'processed' in file1:
            continue

        source_path = os.path.join(rawdata_directory, file1)
        out_file_path = os.path.join(rawdata_out, file1)

        #Checksum the original, copy it, then checksum the copy
        config.get_md5sum(source_path, 'original', action='copy')
        subprocess.call(["cp", source_path, out_file_path])
        config.get_md5sum(out_file_path, 'new')

        #Once copied, unzip/untar/gunzip compressed files (if there are any)
        _decompress_in_place(out_file_path, rawdata_out)

    #Sometimes compressed files spit out more compressed files, so sweep the
    #output directory once more to catch those and uncompress them
    for file2 in os.listdir(rawdata_out):
        _decompress_in_place(os.path.join(rawdata_out, file2), rawdata_out)
def rename(GLDS_path):
    """Move raw files that match the assay metadata into ``raw_files``.

    Each plain file in ``<GLDS_path>/microarray`` is matched against the
    mapping returned by ``metadata_process.read_assay``. Matching files are
    moved to ``<GLDS_path>/microarray/raw_files`` and renamed to
    ``<GLDS>_<sample>_microarray_raw.<extension>``. Files that match nothing
    are left where they are.

    Args:
        GLDS_path: dataset output directory; its base name is used as the
            GLDS identifier.
    """
    #First get all the proper paths according to specifications
    metadata_out = os.path.join(GLDS_path, 'metadata')
    rawdata_out = os.path.join(GLDS_path, 'microarray')
    GLDS = os.path.basename(GLDS_path)
    assay_dict = metadata_process.read_assay(metadata_out)
    final_rawdata_out = os.path.join(rawdata_out, 'raw_files')
    #Stays empty when the directory holds no plain files, so the assay update
    #below can be skipped instead of failing on an undefined name
    extension = ''

    #Make the 'raw_files' directory if it doesn't already exist
    if not os.path.isdir(final_rawdata_out):
        os.makedirs(final_rawdata_out)

    #Loop through the raw data files
    for filename in os.listdir(rawdata_out):
        source_path = os.path.join(rawdata_out, filename)
        if os.path.isdir(source_path):
            continue

        extension = filename.split('.')[-1]
        sample_name = None

        #If a key (first column, per the original comments) occurs in the
        #filename, use it as the sample name; the last matching key wins
        for key in assay_dict:
            if key in filename:
                sample_name = key.replace(' ', '-').replace('_', '-').replace('(', '-').replace(')', '-').strip('-')

        #Otherwise look for any non-empty value of the other columns in the
        #filename, and still name the file after the key
        if sample_name is None:
            for key in assay_dict:
                for item in assay_dict[key]:
                    if item != '' and item in filename:
                        sample_name = key.split('.')[0].replace('_', '-').replace('(', '-').replace(')', '-').replace(' ', '-').strip('-')

        #Files that are not in the metadata are left untouched
        if sample_name is None:
            continue

        target_path = os.path.join(final_rawdata_out, GLDS + '_' + sample_name + '_microarray_raw.' + extension)

        #A failed move (e.g. source already moved) is recorded, not raised
        try:
            config.get_md5sum(source_path, 'original', action='rename')
            with open(os.devnull, 'w') as FNULL:
                subprocess.check_call(["mv", source_path, target_path], stdout=FNULL, stderr=subprocess.STDOUT)
            config.get_md5sum(target_path, 'new')
        except subprocess.CalledProcessError:
            config.md5sum['new'].append(('Move Error', 'N/A'))

    #Update the assay file only when at least one file supplied an extension
    #NOTE(review): the extension passed is that of the last file listed
    if extension:
        metadata_process.modify_assay(metadata_out, GLDS, extension)
def copy(rawdata_directory):
    """Copy every entry of ``rawdata_directory`` to the output tree.

    The destination is ``<config.outdir>/<GLDS>/<config.microarray_out>``,
    where GLDS is the name of the parent directory of ``rawdata_directory``.
    Each copy is bracketed by ``config.get_md5sum`` calls and then handed to
    the ``unzip`` helper (defined elsewhere) to expand compressed files.
    """
    #The GLDS number is the name of the parent directory
    dataset_id = os.path.basename(os.path.dirname(rawdata_directory))
    rawdata_out = os.path.join(config.outdir, dataset_id, config.microarray_out)

    #Create the output directory on first use
    if not os.path.exists(rawdata_out):
        os.makedirs(rawdata_out)

    for entry in os.listdir(rawdata_directory):
        source = os.path.join(rawdata_directory, entry)
        target = os.path.join(rawdata_out, entry)

        #Checksum the original, copy, checksum the copy
        config.get_md5sum(source, 'original', action='copy')
        subprocess.call(["cp", source, target])
        config.get_md5sum(target, 'new')

        #Expand the copy if it is compressed
        unzip(target, rawdata_out)

    #Compressed files sometimes contain further compressed files, so sweep
    #the output directory two more times to catch those
    for _sweep in range(2):
        for leftover in os.listdir(rawdata_out):
            unzip(os.path.join(rawdata_out, leftover), rawdata_out)
def clean(metadata_directory):
    """Unpack the newest metadata zip and standardise the file names.

    Steps:
      1. Pick the most recently modified entry of ``metadata_directory``
         whose file name contains ``zip``.
      2. Copy it to ``<config.outdir>/<GLDS>/metadata``, unzip it there and
         delete the copied zip.
      3. Flatten any sub-folder produced by the unzip into that directory.
      4. Rename every file to ``<prefix>_<GLDS>_microarray_metadata.txt``,
         where ``<prefix>`` is the text before the first underscore.
      5. Call ``modify_i`` on the resulting investigation (``i_``) file.

    Checksum entries are registered through ``config.get_md5sum`` around the
    copy and around each rename. Exit codes of external commands are ignored.

    Args:
        metadata_directory: directory holding the metadata zip files; the
            name of its parent directory is used as the GLDS identifier.
    """
    #Pair every entry with its modification time so the newest sorts first
    #(ties are broken by path)
    entries = []
    for fn in os.listdir(metadata_directory):
        path = os.path.join(metadata_directory, fn)
        entries.append((os.stat(path)[ST_MTIME], path))

    #Find name of GLDS number
    GLDS = os.path.basename(os.path.dirname(metadata_directory))

    #metadata_out is the path to the output metadata
    metadata_out = os.path.join(config.outdir, GLDS, 'metadata')

    #Make appropriate output directory
    if not os.path.exists(metadata_out):
        os.makedirs(metadata_out)

    #Process only the last modified zip file
    for _mtime, path in sorted(entries, reverse=True):
        zip_filename = os.path.basename(path)
        #Test the file name only, so a parent directory containing 'zip'
        #cannot make every entry look like an archive
        if 'zip' not in zip_filename:
            continue

        metadata_zip = os.path.join(metadata_directory, zip_filename)
        copied_zip = os.path.join(metadata_out, zip_filename)

        config.get_md5sum(metadata_zip, 'original', action='copy')
        subprocess.call(["cp", "-r", metadata_zip, metadata_out])
        subprocess.call(["unzip", "-o", "-qq", copied_zip, "-d", metadata_out])
        config.get_md5sum(copied_zip, 'new')
        #Remove the copied zip to avoid confusion and save space
        subprocess.call(["rm", copied_zip])
        break

    #If unzipping produced a folder, move its contents up one level and
    #remove the folder
    for filename in os.listdir(metadata_out):
        folder = os.path.join(metadata_out, filename)
        if not os.path.isdir(folder):
            continue
        for inner in os.listdir(folder):
            #Same selection as a shell '*': hidden entries are not moved and
            #are deleted together with the folder
            if inner.startswith('.'):
                continue
            #One mv per entry, run without a shell, so spaces and special
            #characters in names are safe
            subprocess.call(["mv", os.path.join(folder, inner), metadata_out])
        subprocess.call(["rm", "-r", folder])

    #Rename all metadata files to a standard naming convention
    #NOTE(review): files sharing the same prefix before the first '_' map to
    #the same new name and would overwrite each other - confirm this is fine
    for filename in os.listdir(metadata_out):
        old_path = os.path.join(metadata_out, filename)
        config.get_md5sum(old_path, 'original', action='rename')
        isa = filename.split('_')[0]
        newfilename = isa + '_' + GLDS + '_microarray_metadata.txt'
        new_path = os.path.join(metadata_out, newfilename)
        subprocess.call(["mv", old_path, new_path])
        config.get_md5sum(new_path, 'new')

    #Modify the investigation file to account for sample and assay renaming
    modify_i(GLDS, os.path.join(metadata_out, 'i_' + GLDS + '_microarray_metadata.txt'))
def _strip_naming_noise(name, GLDS):
    """Return ``name`` with separators normalised to '-' and tags removed.

    '_', '(', ')' and ' ' become '-'; the GLDS identifier and the word
    'microarray' are removed; then '--' is collapsed to '-' in a single pass.
    """
    for separator in ('_', '(', ')', ' '):
        name = name.replace(separator, '-')
    return name.replace(GLDS, '').replace('microarray', '').replace('--', '-')


def rename(GLDS_path):
    """Move every raw-data file into ``raw_files`` under a standard name.

    Each plain file in ``<GLDS_path>/<config.microarray_out>`` is classified
    and moved to the ``raw_files`` sub-directory as:

      * ``<GLDS>_<name>_microarray_annotation.adf.<ext>`` if the filename
        contains ``.adf.``;
      * ``<GLDS>_<name>_microarray_annotation.<ext>`` if it contains ``GPL``
        (this also sets ``config.GPL = True``);
      * ``<GLDS>_<sample>_microarray_raw.<ext>`` if it matches the mapping
        returned by ``metadata_process.read_assay``;
      * ``<GLDS>_<name>_microarray_other.<ext>`` otherwise.

    Checksum entries are registered through ``config.get_md5sum`` around
    each move. A failed move is recorded in ``config.md5sum['new']`` rather
    than raised.

    Args:
        GLDS_path: dataset output directory; its base name is used as the
            GLDS identifier.
    """
    #First get all the proper paths according to specifications
    metadata_out = os.path.join(GLDS_path, 'metadata')
    rawdata_out = os.path.join(GLDS_path, config.microarray_out)
    GLDS = os.path.basename(GLDS_path)
    assay_dict = metadata_process.read_assay(metadata_out)
    final_rawdata_out = os.path.join(rawdata_out, 'raw_files')
    extension = ''

    #Make the 'raw_files' directory if it doesn't already exist
    if not os.path.isdir(final_rawdata_out):
        os.makedirs(final_rawdata_out)

    #Loop through the raw data files
    for filename in os.listdir(rawdata_out):
        source_path = os.path.join(rawdata_out, filename)
        if os.path.isdir(source_path):
            continue

        extension = filename.split('.')[-1]

        if '.adf.' in filename:
            #Annotation file: don't include 'raw' in the new name
            stem = _strip_naming_noise(filename, GLDS).replace('.adf', '-adf').strip('-').split('.')[0]
            new_name = GLDS + '_' + stem + '_microarray_annotation.adf.' + extension
        elif 'GPL' in filename:
            stem = _strip_naming_noise(filename, GLDS).replace('.adf', '-adf').strip('-').split('.')[0]
            new_name = GLDS + '_' + stem + '_microarray_annotation.' + extension
            config.GPL = True
        else:
            sample_name = None

            #If a key (first column, per the original comments) occurs in
            #the filename, use it as sample name; the last matching key wins
            for key in assay_dict:
                if key in filename:
                    sample_name = _strip_naming_noise(key, GLDS).strip('-')

            #Otherwise look for the exact filename in the other columns and
            #still name the file after the key
            if sample_name is None:
                for key in assay_dict:
                    for item in assay_dict[key]:
                        if item == filename:
                            sample_name = _strip_naming_noise(key, GLDS).strip('-')

            if sample_name is not None:
                new_name = GLDS + '_' + sample_name + '_microarray_raw.' + extension
            else:
                #Not in the metadata: clean the name, don't call it 'raw'
                stem = _strip_naming_noise(filename, GLDS).strip('-').split('.')[0]
                new_name = GLDS + '_' + stem + '_microarray_other.' + extension

        target_path = os.path.join(final_rawdata_out, new_name)

        try:
            config.get_md5sum(source_path, 'original', action='rename')
            with open(os.devnull, 'w') as FNULL:
                #List form, no shell: quotes, spaces and metacharacters in
                #either path cannot break or alter the command
                subprocess.check_call(["mv", source_path, target_path], stdout=FNULL, stderr=subprocess.STDOUT)
            config.get_md5sum(target_path, 'new')
        except subprocess.CalledProcessError:
            #Same log text as before: mv '<source>' <target>
            failed_command = ' '.join(["mv", "'" + source_path + "'", target_path])
            config.md5sum['new'].append(('Move Error', failed_command))

    #Add appropriate columns and filenames to the assay file in ISA metadata
    #NOTE(review): the extension passed is that of the last file listed
    if extension:
        metadata_process.modify_assay(metadata_out, GLDS, extension)