Example #1
0
def _expand_archive(archive_path, dest_dir):
    """Uncompress a copied file in place when its name looks like an archive.

    The checks are substring tests on the file name ('.zip', '.gz', '.tar'),
    applied in that order, so a name such as 'x.tar.gz' is first gunzipped
    and then untarred.

    archive_path -- full path of the file to inspect
    dest_dir     -- directory that zip/tar contents are extracted into
    """
    archive_name = os.path.basename(archive_path)

    if '.zip' in archive_name:
        subprocess.call(["unzip", "-o", "-qq", archive_path, "-d", dest_dir])
        subprocess.call(["rm", archive_path])

    if '.gz' in archive_name:
        subprocess.call(["gunzip", "-f", archive_path])
        #gunzip replaces 'name.gz' with 'name'; follow the file to its new
        #path so the tar step below does not point at a file that is gone
        if archive_path.endswith('.gz') and not os.path.exists(archive_path):
            archive_path = archive_path[:-len('.gz')]

    if '.tar' in archive_name:
        subprocess.call(["tar", "-xf", archive_path, "-C", dest_dir])
        subprocess.call(["rm", archive_path])


def copy(rawdata_directory):
    """Copy raw microarray files to the output tree and uncompress them.

    The name of the parent directory of rawdata_directory is taken as the
    GLDS identifier; files are copied to <config.outdir>/<GLDS>/microarray.
    config.get_md5sum is called on each source file before the copy and on
    each destination file after it.

    rawdata_directory -- directory holding the original raw data files
    """
    #Find name of GLDS number
    GLDS = os.path.basename(os.path.dirname(rawdata_directory))
    rawdata_out = os.path.join(config.outdir, GLDS, 'microarray')

    #Make appropriate output directory
    if not os.path.exists(rawdata_out):
        os.makedirs(rawdata_out)

    #A file is copied when it carries a raw-data marker or simply is not
    #labelled 'processed' (same selection rule as before, spelled out)
    raw_markers = ('raw', 'RAW', 'Raw', 'CEL')
    for file1 in os.listdir(rawdata_directory):
        has_raw_marker = any(marker in file1 for marker in raw_markers)
        if not (has_raw_marker or 'processed' not in file1):
            continue

        source_path = os.path.join(rawdata_directory, file1)
        out_file_path = os.path.join(rawdata_out, file1)

        #md5sum bookkeeping around the copy: original first, then the copy
        config.get_md5sum(source_path, 'original', action='copy')
        subprocess.call(["cp", source_path, out_file_path])
        config.get_md5sum(out_file_path, 'new')

        #Once copied, unzip/untar/gunzip compressed files (if there are any)
        _expand_archive(out_file_path, rawdata_out)

    #Sometimes compressed files spit out more compressed files, so loop
    #through the output directory once more to catch and uncompress those
    for file2 in os.listdir(rawdata_out):
        _expand_archive(os.path.join(rawdata_out, file2), rawdata_out)
Example #2
0
def rename(GLDS_path):
    """Move raw files listed in the assay metadata into 'raw_files'.

    Every plain file in <GLDS_path>/microarray is matched against the
    dictionary returned by metadata_process.read_assay. A match on a key is
    preferred; otherwise a match on any non-empty item stored under a key is
    used. Matched files are moved to
    <GLDS_path>/microarray/raw_files/<GLDS>_<sample>_microarray_raw.<ext>;
    unmatched files are left where they are. When several keys match, the
    last one iterated wins.

    GLDS_path -- output directory of one dataset; its basename is used as
                 the GLDS identifier
    """
    #First get all the proper paths according to specifications
    metadata_out = os.path.join(GLDS_path, 'metadata')
    rawdata_out = os.path.join(GLDS_path, 'microarray')
    GLDS = os.path.basename(GLDS_path)
    assay_dict = metadata_process.read_assay(metadata_out)
    final_rawdata_out = os.path.join(rawdata_out, 'raw_files')

    #Stays None when the directory holds no plain files, so the final
    #modify_assay call can be skipped instead of hitting an unbound name
    extension = None

    #Make the 'raw_files' directory if it doesn't already exist
    if not os.path.isdir(final_rawdata_out):
        os.makedirs(final_rawdata_out)

    #Loop through the raw data files
    for filename in os.listdir(rawdata_out):
        source_path = os.path.join(rawdata_out, filename)
        if os.path.isdir(source_path):
            continue

        extension = filename.split('.')[-1]
        sample_label = None

        #If a key (first column) appears in the filename, use it as the sample name
        for key in assay_dict:
            if key in filename:
                sample_label = key.replace(' ','-').replace('_','-').replace('(','-').replace(')','-').strip('-')

        #Otherwise look for the filename among the other column values, still naming the file after the key
        if sample_label is None:
            for key in assay_dict:
                for item in assay_dict[key]:
                    if item in filename and item != '':
                        sample_label = key.split('.')[0].replace('_','-').replace('(','-').replace(')','-').replace(' ','-').strip('-')

        #Files that are not referenced by the metadata are left untouched
        if sample_label is None:
            continue

        destination = os.path.join(final_rawdata_out, GLDS+'_'+sample_label+'_microarray_raw.'+extension)

        #A failing mv is recorded in the md5sum log rather than raised
        try:
            config.get_md5sum(source_path, 'original', action='rename')
            with open(os.devnull, 'w') as FNULL:
                subprocess.check_call(["mv", source_path, destination], stdout=FNULL, stderr=subprocess.STDOUT)
            config.get_md5sum(destination, 'new')
        except subprocess.CalledProcessError:
            config.md5sum['new'].append(('Move Error','N/A'))

    #The extension of the last plain file seen is handed to modify_assay
    if extension is not None:
        metadata_process.modify_assay(metadata_out, GLDS, extension)
Example #3
0
def copy(rawdata_directory):
    """Copy every file of rawdata_directory to the output tree and unpack it.

    The destination is <config.outdir>/<GLDS>/<config.microarray_out>, where
    GLDS is the name of the parent directory of rawdata_directory. Each file
    is passed to config.get_md5sum before and after the copy and then handed
    to unzip(). Two further passes over the output directory call unzip() on
    whatever is present, to catch archives produced by earlier extraction.

    rawdata_directory -- directory holding the original raw data files
    """
    glds_name = os.path.basename(os.path.dirname(rawdata_directory))
    rawdata_out = os.path.join(config.outdir, glds_name, config.microarray_out)

    #Create the output directory on first use
    if not os.path.exists(rawdata_out):
        os.makedirs(rawdata_out)

    #Copy each entry, bracketing the copy with md5sum bookkeeping
    for entry in os.listdir(rawdata_directory):
        origin = os.path.join(rawdata_directory, entry)
        target = os.path.join(rawdata_out, entry)

        config.get_md5sum(origin, 'original', action='copy')
        subprocess.call(["cp", origin, target])
        config.get_md5sum(target, 'new')

        #Unpack the fresh copy if it is compressed
        unzip(target, rawdata_out)

    #Archives can contain further archives, so sweep the output directory twice more
    for _ in range(2):
        for leftover in os.listdir(rawdata_out):
            unzip(os.path.join(rawdata_out, leftover), rawdata_out)
def clean(metadata_directory):
    """Unpack the most recently modified metadata zip and standardise names.

    Steps, all performed under <config.outdir>/<GLDS>/metadata, where GLDS is
    the name of the parent directory of metadata_directory:
      1. copy the newest entry whose file name contains 'zip', unzip it and
         delete the copied archive (config.get_md5sum is called on the
         original and on the copy);
      2. move the visible contents of any sub-folder up one level and delete
         the folder;
      3. rename every file to <prefix>_<GLDS>_microarray_metadata.txt, where
         prefix is the text before the first '_' of the old name;
      4. call modify_i on the resulting 'i_' file.

    metadata_directory -- directory holding the original metadata zip files
    """
    #Collect every entry of the input directory
    entry_paths = [os.path.join(metadata_directory, fn) for fn in os.listdir(metadata_directory)]

    #Find name of GLDS number
    GLDS = os.path.basename(os.path.dirname(metadata_directory))
    #metadata_out is the path to the output metadata
    metadata_out = os.path.join(config.outdir, GLDS, 'metadata')

    #Make appropriate output directory
    if not os.path.exists(metadata_out):
        os.makedirs(metadata_out)

    #Newest first, ordered by modification time (ties fall back to the path)
    newest_first = sorted(((os.stat(path)[ST_MTIME], path) for path in entry_paths), reverse=True)

    for _mtime, path in newest_first:
        zip_filename = os.path.basename(path)

        #Test the file name only; a parent directory containing 'zip' must not make every file match
        if 'zip' not in zip_filename:
            continue

        copied_zip = os.path.join(metadata_out, zip_filename)

        #Check md5sum of original zip file
        config.get_md5sum(path, 'original', action='copy')

        #Copy the last modified metadata and unzip it into metadata_out
        subprocess.call(["cp", "-r", path, metadata_out])
        subprocess.call(["unzip", "-o", "-qq", copied_zip, "-d", metadata_out])

        #Verify md5sum for 'new' file while the copied archive still exists
        config.get_md5sum(copied_zip, 'new')

        #Remove the .zip compressed file to avoid confusion and save space
        subprocess.call(["rm", copied_zip])

        #Only the most recent archive is wanted
        break

    #If unzipping produced a folder, move its contents up one directory and remove the folder
    for filename in os.listdir(metadata_out):
        folder = os.path.join(metadata_out, filename)
        if not os.path.isdir(folder):
            continue

        #Same selection a shell '*' would make (hidden entries are skipped),
        #but passed as separate arguments so odd characters in paths are safe
        contents = [os.path.join(folder, name) for name in sorted(os.listdir(folder)) if not name.startswith('.')]
        if contents:
            subprocess.call(["mv"] + contents + [metadata_out])
        subprocess.call(["rm", "-r", folder])

    #Rename all metadata files to a standard naming convention
    #NOTE(review): two files sharing the text before the first '_' map to the
    #same new name, so the later mv replaces the earlier file - confirm that
    #at most one file per prefix is expected
    for filename in os.listdir(metadata_out):
        old_path = os.path.join(metadata_out, filename)
        config.get_md5sum(old_path, 'original', action='rename')
        isa = filename.split('_')[0]
        newfilename = isa + '_' + GLDS + '_microarray_metadata.txt'
        new_path = os.path.join(metadata_out, newfilename)
        subprocess.call(["mv", old_path, new_path])
        config.get_md5sum(new_path, 'new')

    #Modify the investigation file to account for sample and assay renaming
    modify_i(GLDS, os.path.join(metadata_out, 'i_' + GLDS + '_microarray_metadata.txt'))
def _strip_special(text, GLDS):
    """Return text with '_', '(', ')' and ' ' turned into '-', the GLDS
    identifier and the word 'microarray' removed, and one pass of '--'
    collapsed to '-'."""
    for char in ('_', '(', ')', ' '):
        text = text.replace(char, '-')
    return text.replace(GLDS, '').replace('microarray', '').replace('--', '-')


def rename(GLDS_path):
    """Move every file of the microarray output into 'raw_files', renamed.

    Naming, relative to <GLDS_path>/<config.microarray_out>/raw_files:
      * name contains '.adf.'  -> <GLDS>_<name>_microarray_annotation.adf.<ext>
      * name contains 'GPL'    -> <GLDS>_<name>_microarray_annotation.<ext>
                                  (also sets config.GPL to True)
      * an assay key occurs in the name, or the name equals a non-empty item
        stored under a key     -> <GLDS>_<key>_microarray_raw.<ext>
      * anything else          -> <GLDS>_<name>_microarray_other.<ext>
    Names and keys are cleaned with _strip_special first. A failing mv is
    logged to config.md5sum['new'] instead of being raised. If at least one
    file yielded a non-empty extension, metadata_process.modify_assay is
    called with the extension of the last file seen.

    GLDS_path -- output directory of one dataset; its basename is used as
                 the GLDS identifier
    """
    #First get all the proper paths according to specifications
    metadata_out = os.path.join(GLDS_path, 'metadata')
    rawdata_out = os.path.join(GLDS_path, config.microarray_out)
    GLDS = os.path.basename(GLDS_path)
    assay_dict = metadata_process.read_assay(metadata_out)
    final_rawdata_out = os.path.join(rawdata_out, 'raw_files')
    extension = ''

    #Make the 'raw_files' directory if it doesn't already exist
    if not os.path.isdir(final_rawdata_out):
        os.makedirs(final_rawdata_out)

    #Loop through the raw data files
    for filename in os.listdir(rawdata_out):
        source_path = os.path.join(rawdata_out, filename)
        if os.path.isdir(source_path):
            continue

        extension = filename.split('.')[-1]

        if '.adf.' in filename:
            #Annotation file: don't include 'raw' in the new name
            label = _strip_special(filename, GLDS).replace('.adf','-adf').strip('-').split('.')[0]
            suffix = '_microarray_annotation.adf.'
        elif 'GPL' in filename:
            label = _strip_special(filename, GLDS).replace('.adf','-adf').strip('-').split('.')[0]
            suffix = '_microarray_annotation.'
            config.GPL = True
        else:
            label = None

            #If a key (first column) occurs in the filename, name the file after it; the last matching key wins
            for key in assay_dict:
                if key in filename:
                    label = _strip_special(key, GLDS).strip('-')

            #Otherwise look for the exact filename in the other columns, still naming the file after the key
            if label is None:
                for key in assay_dict:
                    for item in assay_dict[key]:
                        if item == filename and item != '':
                            label = _strip_special(key, GLDS).strip('-')

            if label is None:
                #Not in the metadata: clean the filename itself and don't consider it a 'raw' file
                label = _strip_special(filename, GLDS).strip('-').split('.')[0]
                suffix = '_microarray_other.'
            else:
                suffix = '_microarray_raw.'

        destination = os.path.join(final_rawdata_out, GLDS+'_'+label+suffix+extension)

        #Run mv with an argument list (no shell) so quotes, spaces or glob
        #characters in either path are taken literally
        try:
            config.get_md5sum(source_path, 'original', action='rename')
            with open(os.devnull, 'w') as FNULL:
                subprocess.check_call(["mv", source_path, destination], stdout=FNULL, stderr=subprocess.STDOUT)
            config.get_md5sum(destination, 'new')
        except subprocess.CalledProcessError:
            #Log entry keeps the "mv '<source>' <destination>" layout used so far
            config.md5sum['new'].append(('Move Error', "mv '" + source_path + "' " + destination))

    #Add appropriate columns and filenames to the assay file in ISA metadata
    if extension:
        metadata_process.modify_assay(metadata_out, GLDS, extension)