def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outpath):
    """Copy the GPR-format files of the input into the folder `outpath`.

    The input location is `antecedents.identifier`, passed through
    `module_utils.unzip_if_zip` first.  Entries named ".DS_Store" are
    ignored.  Every remaining entry accepted by `gpr_module.check_gpr` is
    copied to `outpath` under its original name; `outpath` is created when
    it does not exist yet.

    Raises AssertionError when the input holds no entries, or when none of
    them passes the GPR check.
    """
    import os
    import shutil
    from Betsy import gpr_module
    from Betsy import module_utils

    source_dir = module_utils.unzip_if_zip(antecedents.identifier)

    candidates = [
        name for name in os.listdir(source_dir) if name != ".DS_Store"
    ]
    assert candidates, 'The input folder or zip file is empty.'

    gpr_names = []
    for name in candidates:
        if gpr_module.check_gpr(os.path.join(source_dir, name)):
            gpr_names.append(name)
    assert gpr_names, 'There are no gpr files in the input.'

    if not os.path.exists(outpath):
        os.mkdir(outpath)
    for name in gpr_names:
        shutil.copyfile(
            os.path.join(source_dir, name), os.path.join(outpath, name))
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """extract the fastq rna seq files

    Every entry of the input folder whose name ends with "fa" or "fastq",
    optionally followed by ".gz", is placed in the folder `outfile`
    (created when missing).  Gzip-compressed entries are decompressed with
    `module_utils.gunzip` and stored without the ".gz" suffix; the
    decompressed intermediate is deleted afterwards.  Entries that match
    none of the suffixes, and ".DS_Store", are skipped.

    Raises AssertionError when the input is empty or when
    `filelib.exists_nz(outfile)` is false after copying.
    """
    import os
    import shutil
    from genomicode import filelib
    from Betsy import module_utils

    in_data = antecedents
    directory = module_utils.unzip_if_zip(in_data.identifier)
    filenames = os.listdir(directory)
    assert filenames, 'The input folder or zip file is empty.'
    if not os.path.exists(outfile):
        os.mkdir(outfile)

    format_types = ('fa', 'fastq')
    gz_suffixes = tuple(suffix + '.gz' for suffix in format_types)
    for filename in filenames:
        if filename == '.DS_Store':
            continue
        # Join with the directory that was actually listed.  It differs
        # from in_data.identifier whenever unzip_if_zip unpacked an archive.
        fileloc = os.path.join(directory, filename)
        if filename.endswith(gz_suffixes):
            newfname = os.path.splitext(filename)[0]
            new_file = module_utils.gunzip(fileloc)
            try:
                shutil.copyfile(new_file, os.path.join(outfile, newfname))
            finally:
                # Only the decompressed intermediate is removed, never the
                # original input file.
                if new_file != fileloc:
                    os.remove(new_file)
        elif filename.endswith(format_types):
            shutil.copyfile(fileloc, os.path.join(outfile, filename))
        # Any other entry is not a sequence file and is left alone.
    assert filelib.exists_nz(outfile), (
        'the output file %s for extract_rna_files_fastq fails' % outfile)
def set_out_attributes(self, antecedents, out_attributes):
    """Set out_attributes['format_type'] from the names of the input files.

    The value becomes 'fastqfolder' when at least one entry (other than
    ".DS_Store") has a name ending in "fa", "fastq", "fa.gz" or "fastq.gz",
    and 'not_fastqfolder' otherwise.  When `module_utils.unzip_if_zip`
    returns a path different from the input identifier, that path is
    removed again once its entries have been listed.

    Returns the same `out_attributes` object that was passed in.
    Raises AssertionError when the input holds no entries.
    """
    import os
    import shutil
    from Betsy import module_utils

    source = antecedents.identifier
    listed_dir = module_utils.unzip_if_zip(source)
    names = os.listdir(listed_dir)
    if listed_dir != source:
        shutil.rmtree(listed_dir)
    assert names, 'The input folder or zip file is empty.'

    accepted = ('fa.gz', 'fa', 'fastq.gz', 'fastq')
    found = any(
        name.endswith(accepted) for name in names if name != '.DS_Store')
    out_attributes['format_type'] = (
        'fastqfolder' if found else 'not_fastqfolder')
    return out_attributes
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Run `samtools sort` on every file of the input folder.

    For each entry of the input folder the command
    `<config.samtools> sort <infile> <outname>` is executed, where
    `outname` is `<name without extension>_sorted.bam` inside the folder
    `outfile` (created when missing).

    Raises AssertionError when the input is empty, when the samtools binary
    configured in `config.samtools` does not exist, or when
    `module_utils.exists_nz` rejects an output file.  Raises ValueError,
    carrying the captured (stdout, stderr) pair, when the captured stderr
    contains the text 'error'.
    """
    import os
    import subprocess
    from Betsy import module_utils
    from genomicode import config

    in_data = antecedents
    directory = module_utils.unzip_if_zip(in_data.identifier)
    filenames = os.listdir(directory)
    assert filenames, 'The input folder or zip file is empty.'
    if not os.path.exists(outfile):
        os.mkdir(outfile)
    samtools_BIN = config.samtools
    assert os.path.exists(samtools_BIN), 'cannot find the %s' % samtools_BIN

    for filename in filenames:
        infile = os.path.join(directory, filename)
        outname = os.path.splitext(filename)[0] + '_sorted.bam'
        outname = os.path.join(outfile, outname)
        command = [samtools_BIN, 'sort', infile, outname]
        process = subprocess.Popen(command,
                                   shell=False,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        # communicate() reads both pipes until EOF and then waits for the
        # child.  Calling wait() first can deadlock once the child has
        # filled a pipe buffer that nobody is reading.
        error_message = process.communicate()
        if 'error' in error_message[1]:
            raise ValueError(error_message)
        assert module_utils.exists_nz(outname), (
            'the output file %s for sort_bam_folder does not exist' % outname)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """extract the cel files with cc or v3_4

    Every entry of the input folder (except ".DS_Store") is classified with
    `affyio.guess_cel_version`; entries reported as 'cc1', 'v3' or 'v4' are
    copied into the folder `outfile`, which is created when missing.

    Raises AssertionError when the input is empty or when
    `filelib.exists_nz(outfile)` is false after copying.  Raises ValueError
    when no entry has one of the accepted CEL versions.
    """
    import os
    import shutil
    from Betsy import module_utils
    from genomicode import affyio
    from genomicode import filelib

    in_data = antecedents
    directory = module_utils.unzip_if_zip(in_data.identifier)
    filenames = os.listdir(directory)
    assert filenames, 'The input folder or zip file is empty.'
    if not os.path.exists(outfile):
        os.mkdir(outfile)

    found_cel = False
    for filename in filenames:
        if filename == '.DS_Store':
            continue
        fileloc = os.path.join(directory, filename)
        cel_v = affyio.guess_cel_version(fileloc)
        if cel_v in ['cc1', 'v3', 'v4']:
            shutil.copyfile(fileloc, os.path.join(outfile, filename))
            found_cel = True

    if not found_cel:
        # The exception has to be raised: asserting on an exception
        # instance always succeeds, so the failure would go unnoticed.
        raise ValueError('There is no cel file in the input.')
    assert filelib.exists_nz(outfile), (
        'the output file %s for extract_CEL_files fails' % outfile)
def set_out_attributes(self, antecedents, out_attributes):
    """Set out_attributes['format_type'] to 'samfolder' or 'not_samfolder'.

    The value is 'samfolder' when at least one entry of the input (other
    than ".DS_Store") has a path ending in "sam" or "sam.gz", and
    'not_samfolder' otherwise.  When `module_utils.unzip_if_zip` returns a
    path different from the input identifier, that path is removed again
    once its entries have been listed.

    Returns the same `out_attributes` object that was passed in.
    Raises AssertionError when the input holds no entries.
    """
    import os
    import shutil
    from Betsy import module_utils

    source = antecedents.identifier
    listed_dir = module_utils.unzip_if_zip(source)
    names = os.listdir(listed_dir)
    if listed_dir != source:
        shutil.rmtree(listed_dir)
    assert names, 'The input folder or zip file is empty.'

    accepted = ('sam.gz', 'sam')
    found = False
    for name in names:
        if name == '.DS_Store':
            continue
        # Only the suffix of the joined path is inspected; the file itself
        # is never opened here.
        if os.path.join(source, name).endswith(accepted):
            found = True
    out_attributes['format_type'] = 'samfolder' if found else 'not_samfolder'
    return out_attributes
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Copy the Agilent files of the input folder into the folder `outfile`.

    A file is accepted when both of the following hold for its first ten
    lines:
      * the first words of the non-blank lines are exactly
        TYPE, FEPARAMS, DATA, *, TYPE, STATS, DATA, *, TYPE, FEATURES;
      * the line starting with FEATURES contains both 'gProcessedSignal'
        and 'rProcessedSignal'.
    Sub-directories and the entries '.DS_Store', '._.DS_Store' and
    '.Rapp.history' are skipped.

    Raises AssertionError when the input is empty or when
    `filelib.exists_nz(outfile)` is false after copying.  Raises ValueError
    when no file is accepted.
    """
    import os
    import shutil
    from Betsy import module_utils
    from genomicode import filelib

    in_data = antecedents
    directory = module_utils.unzip_if_zip(in_data.identifier)
    agilent_files = []
    filenames = os.listdir(directory)
    assert filenames, 'The input folder or zip file is empty.'

    signal_tag = set(['gProcessedSignal', 'rProcessedSignal'])
    expected_postag = [
        'TYPE', 'FEPARAMS', 'DATA', '*', 'TYPE', 'STATS', 'DATA', '*',
        'TYPE', 'FEATURES'
    ]
    for filename in filenames:
        if filename in ['.DS_Store', '._.DS_Store', '.Rapp.history']:
            continue
        fileloc = os.path.join(directory, filename)
        if os.path.isdir(fileloc):
            continue
        postag = []
        fline = []
        # The context manager closes the handle even when reading fails.
        with open(fileloc, 'r') as f:
            for _ in range(10):
                words = f.readline().split()
                if len(words) > 0:
                    postag.append(words[0])
                    if words[0] == 'FEATURES':
                        fline = set(words)
        if signal_tag.issubset(fline) and postag == expected_postag:
            agilent_files.append(filename)

    if not agilent_files:
        raise ValueError('There is no agilent file in the input.')
    if not os.path.exists(outfile):
        os.mkdir(outfile)
    for filename in agilent_files:
        old_file = os.path.join(directory, filename)
        new_file = os.path.join(outfile, filename)
        shutil.copyfile(old_file, new_file)
    assert filelib.exists_nz(outfile), (
        'the output file %s for extract_agilent_files fails' % outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Extract the matrix from each series-matrix file of the input.

    Every entry of the input folder whose name contains 'series_matrix.txt'
    is handed to `extract_expression_file` together with a target path of
    the same name inside `outfile`.

    Raises AssertionError when the input is empty or when
    `filelib.exists_nz(outfile)` is false afterwards.
    """
    import os
    from Betsy import module_utils
    from genomicode import filelib

    source_dir = module_utils.unzip_if_zip(antecedents.identifier)
    entries = os.listdir(source_dir)
    assert entries, 'The input folder or zip file is empty.'

    # NOTE(review): `outfile` is not created in this function; presumably
    # extract_expression_file or the caller takes care of it -- confirm.
    matrix_names = [name for name in entries if 'series_matrix.txt' in name]
    for name in matrix_names:
        extract_expression_file(
            os.path.join(source_dir, name), os.path.join(outfile, name))

    assert filelib.exists_nz(outfile), (
        'the output file %s for extract_matrix_file fails' % outfile)
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Copy the files selected by `guess_datatype` into the folder `outfile`.

    `outfile` is created when missing.  The input identifier must already
    be a plain folder: the path returned by `module_utils.unzip_if_zip` is
    asserted to equal it.  `guess_datatype` is called on that folder and
    the second item of its result is iterated as the paths to copy; each
    one is stored in `outfile` under its base name.

    Raises AssertionError when unzip_if_zip returns a different path or
    when the input folder is empty.
    """
    import os
    import shutil
    from Betsy import module_utils

    if not os.path.exists(outfile):
        os.mkdir(outfile)

    source_dir = module_utils.unzip_if_zip(antecedents.identifier)
    assert source_dir == antecedents.identifier
    assert os.listdir(source_dir), "The input folder or zip file is empty."

    # Only the list of files is used; the detected datatype is ignored.
    _datatype, selected = guess_datatype(source_dir)
    for source_file in selected:
        target = os.path.join(outfile, os.path.basename(source_file))
        shutil.copyfile(source_file, target)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Copy all .idat files of the input into the folder `out_path`.

    The files are taken from `filelib.list_files_in_path`, keeping those
    whose path ends in ".idat" (case-insensitive).  A trailing "_Grn" on
    the base name (before the extension) is dropped in the copy's name.
    `out_path` is created when missing.

    Raises AssertionError when no .idat file is found.
    """
    import os
    import shutil
    from genomicode import filelib
    from Betsy import module_utils

    source_root = module_utils.unzip_if_zip(in_data.identifier)
    idat_paths = [
        path for path in filelib.list_files_in_path(source_root)
        if path.lower().endswith(".idat")
    ]
    assert idat_paths, "No idat files."

    if not os.path.exists(out_path):
        os.mkdir(out_path)

    green_tag = "_Grn"
    for source_file in idat_paths:
        stem, ext = os.path.splitext(os.path.basename(source_file))
        if stem.endswith(green_tag):
            stem = stem[:-len(green_tag)]
        shutil.copyfile(source_file, os.path.join(out_path, stem + ext))
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
    """Run the duplicate-marking jar on every file of the input folder.

    For each entry of the input folder the jar configured in
    `config.Mark_duplicates` is started through `java -Xmx5g -jar` with
    REMOVE_DUPLICATES=true and VALIDATION_STRINGENCY=LENIENT.  The output
    is written to `<name without extension>.bam` inside the folder
    `outfile` (created when missing).

    Raises AssertionError when the input is empty, when the configured jar
    does not exist, or when `module_utils.exists_nz` rejects an output
    file.  Raises ValueError, carrying the captured (stdout, stderr) pair,
    when the captured stderr contains the text 'error'.
    """
    import os
    import subprocess
    from Betsy import module_utils
    from genomicode import config

    in_data = antecedents
    directory = module_utils.unzip_if_zip(in_data.identifier)
    filenames = os.listdir(directory)
    assert filenames, 'The input folder or zip file is empty.'
    if not os.path.exists(outfile):
        os.mkdir(outfile)
    mark_duplicates_path = config.Mark_duplicates
    assert os.path.exists(
        mark_duplicates_path), 'cannot find the %s' % mark_duplicates_path

    for filename in filenames:
        infile = os.path.join(directory, filename)
        outname = os.path.splitext(filename)[0] + '.bam'
        outname = os.path.join(outfile, outname)
        # NOTE(review): METRICS_FILE is a bare name, presumably resolved
        # against the current working directory -- confirm.
        command = [
            'java', '-Xmx5g', '-jar', mark_duplicates_path, 'I=' + infile,
            'O=' + outname, 'METRICS_FILE=metricsFile',
            'VALIDATION_STRINGENCY=LENIENT', 'REMOVE_DUPLICATES=true'
        ]
        process = subprocess.Popen(command,
                                   shell=False,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        # communicate() reads both pipes until EOF and then waits for the
        # child.  Calling wait() first can deadlock once the child has
        # filled a pipe buffer that nobody is reading.
        error_message = process.communicate()
        if 'error' in error_message[1]:
            raise ValueError(error_message)
        assert module_utils.exists_nz(outname), (
            'the output file %s for flag_dups_in_bam_folder does not exist' %
            outname)