def parse_sample(sampleList):
    total_number = 0
    with open(sampleList, 'r') as f:
        sampleInfo = bundle()
        for line in f:
            total_number += 1
            line = line.strip()
            field = line.split()
            sampleName = field[0]
            rg_LB = field[2]
            rg_ID = "{}-{}".format(sampleName, field[3])
            fq_dir = field[-1].strip()
            fq1s = glob.glob("%s/*1.fq.gz" % fq_dir)
            fq1 = ''
            fq2 = ''
            if fq1s:
                fq1 = fq1s[0].strip()
            else:
                logger.error("fq1 under %s does not exist." % sampleName)
                exit(3)
            rg = "@RG\\tID:%s\\tPL:COMPLETE\\tLB:%s\\tSM:%s\\tCN:BGI" % (
                rg_ID, rg_LB, sampleName)
            fq_lib_name = rg_ID
            if not sampleInfo.has_key(sampleName):
                sampleInfo[sampleName] = bundle()
                sample_lane_counter = 0
            else:
                sample_lane_counter = len(sampleInfo[sampleName])
            dataTag = 'data' + str(sample_lane_counter)
            if not sampleInfo[sampleName].has_key(dataTag):
                sampleInfo[sampleName][dataTag] = bundle()
            # find adp1
            sampleInfo[sampleName][dataTag]['fq1'] = fq1
            # find fq2 and adp2
            fq2 = fq1.replace("1.fq.gz", "2.fq.gz")
            if os.path.exists(fq2):
                sampleInfo[sampleName][dataTag]['fq2'] = fq2
            else:
                logger.warning("%s of line: %d is SE data!" % (sampleName, total_number))
            sampleInfo[sampleName][dataTag]['rg'] = rg
            # sampleInfo[sampleName][dataTag]['libname'] = fq_lib_name
            sampleInfo[sampleName][dataTag]['gender'] = 'male'
    return sampleInfo
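# A minimal usage sketch for the parser above. The column layout is inferred from the field
# indices used in the code (field[0] = sample name, field[2] = library, field[3] = lane tag,
# last field = fastq directory); the file name and values below are hypothetical.
#
#   sample.list line:  NA12878  <ignored>  LIB01  L01  /path/to/fq_dir
#   /path/to/fq_dir is expected to contain <prefix>1.fq.gz and, for PE data, <prefix>2.fq.gz.
#
#   sampleInfo = parse_sample('sample.list')
#   print sampleInfo['NA12878']['data0']['fq1']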
def parse_sample(sampleList):
    with open(sampleList, 'r') as f:
        sampleInfo = bundle()
        for line in f:
            line = line.strip()
            tmp = line.split()
            sampleInfo[tmp[0]] = tmp[1]
    return sampleInfo
def parse_sample(sampleList):
    with open(sampleList, 'r') as f:
        sampleInfo = bundle()
        sample_lane_counter = 0
        for line in f:
            line = line.strip()
            if re.match(r"^\s*$", line):
                continue
            if line[0] == '#':
                continue
            sampleName = ''
            m = re.match(r"^>(\S+)$", line)
            if m:
                sampleName = m.group(1)
                if not sampleInfo.has_key(sampleName):
                    sampleInfo[sampleName] = bundle()
                    sample_lane_counter = 0
                else:
                    sample_lane_counter += 1
                dataTag = 'data' + str(sample_lane_counter)
                sampleInfo[sampleName][dataTag] = bundle()
                for info in f:
                    info = info.strip()
                    if re.match(r"^\s*$", info):
                        continue
                    if info[0] == '#':
                        continue
                    m2 = re.match(r"^(\S+)\s*=\s*(\S+)", info)
                    if m2:
                        sampleInfo[sampleName][dataTag][m2.group(1)] = m2.group(2)
                    if re.match(r"^>\s*$", info):
                        break
    return sampleInfo
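# A hedged illustration of the block format this parser appears to expect: a ">sampleName"
# header, followed by "key = value" lines, terminated by a bare ">" line. The keys and paths
# below are hypothetical examples, not a confirmed specification.
#
#   >sampleA
#   fq1 = /path/sampleA_1.fq.gz
#   fq2 = /path/sampleA_2.fq.gz
#   gender = male
#   >
#
#   sampleInfo = parse_sample('sample.list')   # -> sampleInfo['sampleA']['data0']['fq1']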
class ParseSampleList(object):
    ''' This class is used to parse sample list '''
    config = bundle()

    def __init__(self, sampleList, config):
        ''' Constructor '''
        self.sampleList = sampleList
        self.config = config

    def rectify_gender(self, gender):
        if gender == 'F' or gender == 'female':
            return 'female'
        else:
            return 'male'

    def check_gender(self, sampleinfo, sampleName):
        sampleGender = ''
        for dataTag in sampleinfo:
            if not sampleGender:
                if sampleinfo[dataTag].get('gender'):
                    sampleGender = sampleinfo[dataTag]['gender']
            elif sampleGender != sampleinfo[dataTag]['gender']:
                logger.error("gender in %s is different in each lane!" % sampleName)
        return sampleGender

    def sampleParser(self, modetype):
        modname = 'mode' + str(modetype)
        parse_sample = getattr(search_mod(modname, self.config.Path.modeDir), 'parse_sample')
        sampleInfo = parse_sample(self.sampleList)
        return sampleInfo

    def parse(self, mode):
        self.config.info = bundle()
        self.config.info.female_counter = 0
        self.config.info.male_counter = 0
        self.config.sample = bundle()
        total_number = 0
        male_total_number = 0
        female_total_number = 0
        fq_file_set = set()
        sampleInfo = self.sampleParser(mode)
        if mode == 3 or mode == 4:
            self.config.sample = sampleInfo
            if self.config.analysisList[0] == 'init':
                self.config.analysisList = self.config.analysisList[1:]
            if mode == 3:
                unavailableStep = ['filter', 'alignment']
                for step in unavailableStep:
                    if step in self.config.analysisList:
                        logger.error("Cannot run this step (%s) in mode3" % step)
                        exit(0)
                        # printtime("WARNING: step %s is dropped in mode 3."% step)
                        # self.config.analysisList.remove(step)
            elif mode == 4:
                unavailableStep = ['filter', 'alignment', 'rmdup', 'realignment',
                                   'baserecal', 'genotype', 'bamSort']
                for step in unavailableStep:
                    if step in self.config.analysisList:
                        logger.error("Cannot run this step (%s) in mode4" % step)
                        exit(0)
                        # printtime("WARNING: step %s is dropped in mode 4."% step)
                        # self.config.analysisList.remove(step)
        else:
            fastq_necessary_property = ("id", "fq1", "rg")
            for sampleName in sampleInfo:
                rg_id_dict = bundle()
                sampleIsSE = bundle()
                self.config.sample[sampleName] = bundle(rg=bundle(), lane=bundle())
                sampleGender = self.rectify_gender(
                    self.check_gender(sampleInfo[sampleName], sampleName))
                if sampleGender:
                    if self.config.ref.gender_mode != 'normal':
                        self.config.sample[sampleName].gender = sampleGender
                        if sampleGender == 'male':
                            self.config.info.male_counter += 1
                        else:
                            self.config.info.female_counter += 1
                    else:
                        self.config.sample[sampleName].gender = 'normal'
                else:
                    self.config.sample[sampleName].gender = 'normal'
                pool = ''
                for dataTag in sampleInfo[sampleName]:
                    if not pool:
                        pool = sampleInfo[sampleName][dataTag].get('pool')
                    if self.config.ref.gender_mode == 'both':
                        gender = self.rectify_gender(
                            sampleInfo[sampleName][dataTag].get('gender'))
                        if gender == 'female':
                            sampleInfo[sampleName][dataTag]['id'] = female_total_number
                            female_total_number += 1
                        if gender == 'male':
                            sampleInfo[sampleName][dataTag]['id'] = male_total_number
                            male_total_number += 1
                    else:
                        sampleInfo[sampleName][dataTag]['id'] = total_number
                        total_number += 1
                    self.config.sample[sampleName]['rg'][dataTag] = \
                        sampleInfo[sampleName][dataTag]['rg']
                    # RG.ID must not be repeated within the same sample
                    rg_id = sampleInfo[sampleName][dataTag]['rg'].split('ID:')[1].split('\\t')[0]
                    if not rg_id_dict.has_key(rg_id):
                        rg_id_dict[rg_id] = True
                    else:
                        logger.error('The same RG.ID in the different data (%s) of %s'
                                     % (dataTag, sampleName))
                    # if not sampleInfo[sampleName][dataTag].has_key('gender'):
                    #     logger.error("No gender info in %s %s!" % (sampleName, dataTag))
                    for prop in fastq_necessary_property:
                        if not sampleInfo[sampleName][dataTag].has_key(prop):
                            logger.error(
                                "fastq property: %s does not exist in the %s of %s. You must set it in your sample file."
                                % (prop, dataTag, sampleName))
                    if not os.path.exists(sampleInfo[sampleName][dataTag]['fq1']):
                        raise RuntimeError("%s does not exist!" % sampleInfo[sampleName][dataTag]['fq1'])
                    if sampleInfo[sampleName][dataTag].get('fq2') and \
                            sampleInfo[sampleName][dataTag]['fq2'] != 'null':
                        if not os.path.exists(sampleInfo[sampleName][dataTag]['fq2']):
                            raise RuntimeError("%s does not exist!" % sampleInfo[sampleName][dataTag]['fq2'])
                        # fq file
                        if sampleInfo[sampleName][dataTag]['fq2'] not in fq_file_set:
                            fq_file_set.add(sampleInfo[sampleName][dataTag]['fq2'])
                        else:
                            raise RuntimeError("%s used more than once!"
                                               % sampleInfo[sampleName][dataTag]['fq2'])
                        if sampleIsSE.has_key('isSE') and sampleIsSE['isSE'] == True:
                            logger.error("%s: There is an error about the property fq2 in your sample file." % sampleName)
                        sampleIsSE['isSE'] = False
                    else:
                        if sampleIsSE.has_key('isSE') and sampleIsSE['isSE'] == False:
                            logger.error("%s: There is an error about the property fq2 in your sample file." % sampleName)
                        sampleIsSE['isSE'] = True
                    # fq file
                    if sampleInfo[sampleName][dataTag]['fq1'] not in fq_file_set:
                        fq_file_set.add(sampleInfo[sampleName][dataTag]['fq1'])
                    else:
                        raise RuntimeError("%s used more than once!"
                                           % sampleInfo[sampleName][dataTag]['fq1'])
                    self.config.sample[sampleName]['lane'][dataTag] = sampleInfo[sampleName][dataTag]
                self.config.sample[sampleName].pool = pool
                self.config.init.isSE = sampleIsSE['isSE']
                self.config.sample[sampleName].isSE = sampleIsSE['isSE']
        return sampleInfo
def run(self, impl, sampleInfo):
    mode = self.option.mode
    result = bundle(output=bundle(), script=bundle())
    # extend program path
    self.init.multiUploader = self.expath('init.multiUploader')
    self.init.gzUploader = self.expath('init.gzUploader')
    self.init.check_log = self.expath('init.check_log')
    self.init.bgzip = self.expath('init.bgzip', False)
    self.init.samtools = self.expath('init.samtools', False)
    sampleName = self.option.multiSampleName
    scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
    self.analysisList = self.analysisList[1:]
    hdfs_gz_tmp = os.path.join(self.option.dirHDFS, sampleName, 'data', 'gz_tmp')
    tmp = impl.mkdir(self.option.workdir, "temp", sampleName, 'ubam')
    rawData = impl.mkdir(self.option.workdir, "ubam", sampleName)
    ubam = []
    DataParam = []
    output = bundle()
    cmd = []
    for sample_name in sampleInfo.keys():
        sample = sampleInfo[sample_name]
        output[sample_name] = bundle()
        for dataTag in sample.keys():
            output[sample_name][dataTag] = bundle()
            filename = '{}_{}.bam'.format(sample_name, dataTag)
            output[sample_name][dataTag]['bam'] = os.path.join(rawData, filename)
            ubam.append(output[sample_name][dataTag]['bam'])
            DataParam.append({
                "KEY1": sample[dataTag]['fq1'],
                "KEY2": sample[dataTag]['fq2'],
                "KEY3": output[sample_name][dataTag]['bam'],
                "KEY4": sample_name,
                "KEY5": sample_name + "_" + dataTag
            })
    if DataParam:
        impl.write_file(
            fileName='data.list',
            scriptsdir=scriptsdir,
            commands=["${KEY1}\t${KEY2}\t${KEY3}\t${KEY4}\t${KEY5}"],
            JobParamList=DataParam)
    mapper = []
    mapper.append("#!/usr/bin/perl -w")
    mapper.append("use strict;\n")
    mapper.append("while(<STDIN>)\n{")
    mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
    mapper.append("\tif(!-e $tmp[1])\n\t{")
    mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
    mapper.append("\t\texit 1;\n\t}")
    mapper.append(
        "\tsystem(\"%s FastqToSam -F1 $tmp[1] -F2 $tmp[2] -O $tmp[3] -SM $tmp[4] -RG $tmp[5] --TMP_DIR %s -PL illumina\");\n}"
        % (self.init.gatk, tmp))
    impl.write_file(
        fileName='upload_mapper.pl',
        scriptsdir=scriptsdir,
        commands=mapper)
    hadoop_parameter = ' -D mapred.job.name="upload data" '
    if self.hadoop.get('queue'):
        hadoop_parameter += '-D mapreduce.job.queuename={} '.format(self.hadoop.queue)
    hadoop_parameter += ' -D mapred.map.tasks=%d ' % len(DataParam)
    hadoop_parameter += '-D mapreduce.map.memory.mb=10240 '
    hadoop_parameter += ' -D mapred.reduce.tasks=0 '
    hadoop_parameter += ' -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat '
    ParamDict = {
        "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.hadoop.streamingJar),
        "MAPPER": os.path.join(scriptsdir, 'upload_mapper.pl'),
        "INPUT": 'file://' + os.path.join(scriptsdir, 'data.list'),
        "OUTPUT": hdfs_gz_tmp,
        "HADOOPPARAM": hadoop_parameter
    }
    cmd.append('%s ${OUTPUT}' % self.fs_cmd.delete)
    cmd.append('${PROGRAM} ${HADOOPPARAM} -input ${INPUT} -output ${OUTPUT} -mapper "perl ${MAPPER}"')
    # cmd.append('%s jar %s GzUploader -i %s -l' % (
    #     self.hadoop.bin, self.init.gzUploader, os.path.join(scriptsdir, 'data.list')))
    # write script
    scriptPath = \
        impl.write_shell(
            name='init',
            scriptsdir=scriptsdir,
            commands=cmd,
            paramDict=ParamDict)
    result.script[sampleName] = scriptPath
    result.output = output
    if self.init.qualitysystem == '':
        self.check_qs(sampleInfo)
        print "[INFO ] -- qualitysystem is %s (autocheck)--" % self.init.qualitysystem
    else:
        print "[INFO ] -- qualitysystem is %s --" % self.init.qualitysystem
    return result
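# For reference, joining the mapper template strings above (with the two %s placeholders filled
# from self.init.gatk and the temp directory at run time) yields a Perl script of roughly this
# shape; "<gatk>" and "<tmp>" stand in for those runtime values.
#
#   #!/usr/bin/perl -w
#   use strict;
#
#   while(<STDIN>)
#   {
#       chomp;
#       my @tmp = split(/\t/);
#       if(!-e $tmp[1])
#       {
#           print "$tmp[1] don't exist.\n";
#           exit 1;
#       }
#       system("<gatk> FastqToSam -F1 $tmp[1] -F2 $tmp[2] -O $tmp[3] -SM $tmp[4] -RG $tmp[5] --TMP_DIR <tmp> -PL illumina");
#   }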
def parse_sample(sampleList):
    total_number = 0
    sampleInfo = bundle()
    with open(sampleList, 'r') as f:
        for line in f:
            fq1s = []
            line = line.strip()
            field = line.split()
            rg_LB = field[2]
            rg_PU = field[3]
            sampleName = field[1]
            if field[3].find(',') != -1:
                fq1s.append(field[3].split(',')[0])
                rg_LB = field[1]
                rg_PU = field[2]
                sampleName = field[0]
            else:
                fq_dir = field[-1].strip()
                fq1s = glob.glob("%s/*1.fq.gz" % fq_dir)
                if not fq1s:
                    fq1s = glob.glob("%s/*/*1.fq.gz" % fq_dir)
            if len(fq1s) == 0 or not os.path.exists(fq1s[0]):
                logger.error("fq1 under %s does not exist." % sampleName)
                exit(3)
            for fq1 in fq1s:
                total_number += 1
                if not sampleInfo.has_key(sampleName):
                    sampleInfo[sampleName] = bundle()
                    sample_lane_counter = 0
                else:
                    sample_lane_counter = len(sampleInfo[sampleName])
                # fq_name = os.path.basename(fq1)
                # fq_dir = os.path.abspath(os.path.dirname(fq1))
                # slideID_laneID_barcode
                # CL100035764_L02_33_1.fq.gz
                # tmp = fq_name.split("_")
                # rg_PU = tmp[0] + "_" + tmp[1] + "_" + tmp[2]
                rg_ID = "{}_{}".format(sampleName, sample_lane_counter)
                rg = "@RG\\tID:%s\\tPL:COMPLETE\\tPU:%s\\tLB:%s\\tSM:%s\\tCN:BGI" % (
                    rg_ID, rg_PU, rg_LB, sampleName)
                fq_lib_name = rg_ID
                dataTag = 'data' + str(sample_lane_counter)
                if not sampleInfo[sampleName].has_key(dataTag):
                    sampleInfo[sampleName][dataTag] = bundle()
                sampleInfo[sampleName][dataTag]['fq1'] = fq1
                # find fq2
                fq2 = fq1.replace("1.fq.gz", "2.fq.gz")
                if os.path.exists(fq2):
                    sampleInfo[sampleName][dataTag]['fq2'] = fq2
                else:
                    logger.warning("%s of line: %d is SE data!" % (sampleName, total_number))
                sampleInfo[sampleName][dataTag]['rg'] = rg
                sampleInfo[sampleName][dataTag]['libname'] = fq_lib_name
                sampleInfo[sampleName][dataTag]['gender'] = 'male'
    return sampleInfo
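# Two line layouts appear to be handled by the parser above; the values are hypothetical.
#   1) explicit fastq path(s) in column 4:
#        sampleA  LIB01  PU01  /path/a_1.fq.gz,/path/a_2.fq.gz
#      -> sampleName = field[0], rg_LB = field[1], rg_PU = field[2], fq1 taken from field[3]
#   2) fastq directory in the last column:
#        <runTag>  sampleA  LIB01  PU01  /path/to/fq_dir
#      -> sampleName = field[1], rg_LB = field[2], rg_PU = field[3], *1.fq.gz globbed under field[-1]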
class init(Workflow):
    """ init data, init data path """
    INIT = bundle(hadoop=bundle(), init=bundle())
    INIT.init.multiUploader = 'multi_uploader.pl'
    INIT.init.gzUploader = "gaeatools.jar"
    INIT.init.bgzip = 'bgzip'
    INIT.init.perl = 'perl'
    INIT.init.samtools = 'samtools'
    INIT.init.qualitysystem = ''
    INIT.init.check_log = '%s' % os.path.join(os.environ['GAEA_HOME'], 'bin', 'check_log.pl')
    INIT.init.check_state_param = ''
    INIT.hadoop.ishadoop2 = False
    INIT.hadoop.is_at_TH = False
    INIT.hadoop.fs_mode = 'hdfs'
    INIT.hadoop.input_format = 'hdfs'
    INIT.hadoop.mapper_num = '112'
    INIT.hadoop.reducer_num = '112'

    def check_qs(self, sampleInfo):
        for sample_name in sampleInfo:
            for dataTag in sampleInfo[sample_name]:
                fq = sampleInfo[sample_name][dataTag]['fq1']
                self.init.qualitysystem = qualitysystem.getqualitysystem(fq)
                if self.init.qualitysystem != '-1':
                    return self.init.qualitysystem
        if self.init.qualitysystem == '-1':
            raise RuntimeError('qualitysystem is wrong, the value is -1')

    def run(self, impl, sampleInfo):
        mode = self.option.mode
        result = bundle(output=bundle(), script=bundle())
        # extend program path
        self.init.multiUploader = self.expath('init.multiUploader')
        self.init.gzUploader = self.expath('init.gzUploader')
        self.init.check_log = self.expath('init.check_log')
        self.init.bgzip = self.expath('init.bgzip', False)
        self.init.samtools = self.expath('init.samtools', False)
        mapper = []
        mapper.append("#!/usr/bin/perl -w")
        mapper.append("use strict;\n")
        mapper.append("while(<STDIN>)\n{")
        mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
        mapper.append("\tif(!-e $tmp[1])\n\t{")
        mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
        mapper.append("\t\texit 1;\n\t}")
        mapper.append(
            "\tsystem(\"%s jar %s GzUploader -i $tmp[1] -o $tmp[2] -n $tmp[3]\");\n}"
            % (self.hadoop.bin, self.init.gzUploader))
        # self.analysisList = self.analysisList[1:]
        output = bundle()
        for sample_name in sampleInfo.keys():
            raw_data = os.path.join(self.option.dirHDFS, sample_name, 'fq')
            scriptsdir = impl.mkdir(self.gaeaScriptsDir, sample_name)
            DataParam = []
            sample = sampleInfo[sample_name]
            output[sample_name] = bundle()
            # output[sample_name]['outdir'] = bundle()
            for dataTag in sample.keys():
                output[sample_name][dataTag] = bundle()
                pathTup = impl.splitext(sample[dataTag]['fq1'])
                filename = '{}_{}_{}'.format(sample_name, dataTag, pathTup[0])
                DataParam.append({
                    "KEY": sample[dataTag]['fq1'],
                    "VALUE": raw_data,
                    "VALUE2": filename
                })
                output[sample_name][dataTag]['fq1'] = os.path.join(raw_data, filename)
                if not self.init.isSE:
                    pathTup = impl.splitext(sample[dataTag]['fq2'])
                    filename = '{}_{}_{}'.format(sample_name, dataTag, pathTup[0])
                    DataParam.append({
                        "KEY": sample[dataTag]['fq2'],
                        "VALUE": raw_data,
                        "VALUE2": filename
                    })
                    output[sample_name][dataTag]['fq2'] = os.path.join(raw_data, filename)
            # output[sample_name]['outdir'] = raw_data
            impl.write_file(fileName='data.list',
                            scriptsdir=scriptsdir,
                            commands=["${KEY}\t${VALUE}\t${VALUE2}"],
                            JobParamList=DataParam)
            ParamDict = {
                "PROGRAM": "%s jar %s GzUploader" % (self.hadoop.bin, self.init.gzUploader),
                "INPUT": os.path.join(scriptsdir, 'data.list'),
            }
            # write script
            scriptPath = \
                impl.write_shell(
                    name='init',
                    scriptsdir=scriptsdir,
                    commands=['${PROGRAM} -i ${INPUT} -l'],
                    paramDict=ParamDict)
            result.script[sample_name] = scriptPath
        result.output = output
        if self.init.qualitysystem == '':
            self.check_qs(sampleInfo)
            print "[INFO ] -- qualitysystem is %s (autocheck)--" % self.init.qualitysystem
        else:
            print "[INFO ] -- qualitysystem is %s --" % self.init.qualitysystem
        return result
def run(self, impl, sampleInfo):
    mode = self.option.mode
    result = bundle(output=bundle())
    # extend program path
    self.init.multiUploader = self.expath('init.multiUploader')
    self.init.gzUploader = self.expath('init.gzUploader')
    self.init.check_log = self.expath('init.check_log')
    self.init.bgzip = self.expath('init.bgzip', False)
    self.init.samtools = self.expath('init.samtools', False)
    if self.hadoop.input_format == 'hdfs':
        if mode != 3 and mode != 4:
            if self.option.multiSample:
                sampleName = self.option.multiSampleName
                scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
                self.analysisList = self.analysisList[1:]
                output = bundle()
                line = ["${ID}\t${RG}\t${FQ1}\t${FQ2}\t${ADP1}\t${ADP2}"]
                if self.ref.gender_mode == 'both' and mode != 5:
                    output.female = os.path.join(scriptsdir, "femalesampleinfo.list")
                    output.male = os.path.join(scriptsdir, "malesampleinfo.list")
                    MSLF = open(output.female, 'w')
                    MSLM = open(output.male, 'w')
                    for sample_name in sampleInfo.keys():
                        sample = sampleInfo[sample_name]
                        LineParam = []
                        for dataTag in sampleInfo[sample_name].keys():
                            LineParam.append({
                                "ID": sample[dataTag]['id'],
                                "RG": sample[dataTag]['rg'],
                                "FQ1": 'file://' + sample[dataTag]['fq1'],
                                "FQ2": sample[dataTag].has_key('fq2') and 'file://' + sample[dataTag]['fq2'] or 'null',
                                "ADP1": sample[dataTag].has_key('adp1') and 'file://' + sample[dataTag]['adp1'] or 'null',
                                "ADP2": sample[dataTag].has_key('adp2') and 'file://' + sample[dataTag]['adp2'] or 'null'
                            })
                        gender = self.sample[sample_name]["gender"]
                        impl.fileAppend(
                            fh=gender == 'female' and MSLF or MSLM,
                            commands=line,
                            JobParamList=LineParam)
                else:
                    output.normal = os.path.join(scriptsdir, "sampleinfo.list")
                    MSL = open(output.normal, 'w')
                    for sample_name in sampleInfo.keys():
                        sample = sampleInfo[sample_name]
                        LineParam = []
                        for dataTag in sample.keys():
                            LineParam.append({
                                "ID": sample[dataTag]['id'],
                                "RG": sample[dataTag]['rg'],
                                "FQ1": 'file://' + sample[dataTag]['fq1'],
                                "FQ2": sample[dataTag].has_key('fq2') and 'file://' + sample[dataTag]['fq2'] or 'null',
                                "ADP1": sample[dataTag].has_key('adp1') and 'file://' + sample[dataTag]['adp1'] or 'null',
                                "ADP2": sample[dataTag].has_key('adp2') and 'file://' + sample[dataTag]['adp2'] or 'null'
                            })
                        impl.fileAppend(
                            fh=MSL,
                            commands=line,
                            JobParamList=LineParam)
                result.output[sampleName] = output
            else:
                result.script = bundle()
                for sampleName in sampleInfo.keys():
                    scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
                    sample = sampleInfo[sampleName]
                    output = bundle()
                    DataParam = []
                    cmd = []
                    cmd.append("source %s/bin/activate" % self.GAEA_HOME)
                    cmd.append("check.py -s %s/state.json -n %s %s" % (
                        self.stateDir, sampleName, self.init.check_state_param))
                    for dataTag in sample.keys():
                        laneData = os.path.join(self.option.dirHDFS, sampleName, 'fq', dataTag)
                        cmd.append('{} -p {}'.format(self.fs_cmd.mkdir, laneData))
                        output[dataTag] = bundle()
                        pathTup = impl.splitext(sample[dataTag]['fq1'])
                        if pathTup and pathTup[1] == '.gz':
                            DataParam.append({
                                "KEY": sample[dataTag]['fq1'],
                                "VALUE": laneData
                            })
                            output[dataTag]['fq1'] = os.path.join(laneData, pathTup[0])
                        else:
                            output[dataTag]['fq1'] = sample[dataTag]['fq1']
                        if self.init.isSE == False:
                            pathTup = impl.splitext(sample[dataTag]['fq2'])
                            if pathTup and pathTup[1] == '.gz':
                                DataParam.append({
                                    "KEY": sample[dataTag]['fq2'],
                                    "VALUE": laneData
                                })
                                output[dataTag]['fq2'] = os.path.join(laneData, pathTup[0])
                            else:
                                output[dataTag]['fq2'] = sample[dataTag]['fq2']
                        if sample[dataTag].has_key('adp1'):
                            pathTup = impl.splitext(sample[dataTag]['adp1'])
                            if pathTup and pathTup[1] == '.gz':
                                DataParam.append({
                                    "KEY": sample[dataTag]['adp1'],
                                    "VALUE": laneData
                                })
                                output[dataTag]['adp1'] = os.path.join(laneData, pathTup[0])
                            else:
                                output[dataTag]['adp1'] = sample[dataTag]['adp1']
                        if sample[dataTag].has_key('adp2'):
                            pathTup = impl.splitext(sample[dataTag]['adp2'])
                            if pathTup and pathTup[1] == '.gz':
                                DataParam.append({
                                    "KEY": sample[dataTag]['adp2'],
                                    "VALUE": laneData
                                })
                                output[dataTag]['adp2'] = os.path.join(laneData, pathTup[0])
                            else:
                                output[dataTag]['adp2'] = sample[dataTag]['adp2']
                    # print DataParam
                    if DataParam:
                        impl.write_file(
                            fileName='data.list',
                            scriptsdir=scriptsdir,
                            commands=["${KEY}\t${VALUE}"],
                            JobParamList=DataParam)
                    ParamDict = {
                        "PROGRAM": self.init.multiUploader,
                        "HADOOP": self.hadoop.bin,
                        "UPLOAD": self.init.gzUploader,
                        "INPUT": os.path.join(scriptsdir, 'data.list')
                    }
                    cmd.append('%s ${PROGRAM} -b ${HADOOP} -d ${INPUT} -u ${UPLOAD}' % self.init.perl)
                    # write script
                    scriptPath = \
                        impl.write_shell(
                            name='init',
                            scriptsdir=scriptsdir,
                            commands=cmd,
                            paramDict=ParamDict)
                    result.script[sampleName] = scriptPath
                    result.output[sampleName] = output
            if self.init.qualitysystem == '':
                self.check_qs(sampleInfo)
                print "[INFO ] -- qualitysystem is %s (autocheck)--" % self.init.qualitysystem
            else:
                print "[INFO ] -- qualitysystem is %s --" % self.init.qualitysystem
            # self.init.qualitysystem = 0
        else:
            sampleName = self.option.multiSampleName
            startStep = self.analysisList[0]
            fs_type = ''
            if self.analysisDict[startStep].platform == 'H':
                fs_type = 'file://'
            if self.option.multiSample:
                n = 0
                index = 0
                inputDir = os.path.join(self.option.workdir, 'raw_data', 'bams_' + str(index))
                result.output[sampleName + '_' + str(index)] = fs_type + inputDir
                if os.path.exists(inputDir):
                    shutil.rmtree(inputDir)
                impl.mkdir(inputDir)
                print inputDir
                for sample_name in sampleInfo:
                    if n == int(self.init.multisample_num):
                        n = 0
                        index += 1
                        inputDir = os.path.join(self.option.workdir, 'raw_data', 'bams_' + str(index))
                        result.output[sampleName + '_' + str(index)] = fs_type + inputDir
                        if os.path.exists(inputDir):
                            shutil.rmtree(inputDir)
                        impl.mkdir(inputDir)
                        print inputDir
                    bam = os.path.basename(sampleInfo[sample_name])
                    ln_bam = os.path.join(inputDir, bam)
                    os.symlink(sampleInfo[sample_name], ln_bam)
                    n += 1
            else:
                for sample_name in sampleInfo:
                    result.output[sample_name] = fs_type + sampleInfo[sample_name]
    else:
        if mode != 3 and mode != 4:
            if self.option.multiSample:
                sampleName = self.option.multiSampleName
                scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
                self.analysisList = self.analysisList[1:]
                output = bundle()
                line = ["${ID}\t${RG}\t${FQ1}\t${FQ2}\t${ADP1}\t${ADP2}"]
                if self.ref.gender_mode == 'both' and mode != 5:
                    output.female = os.path.join(scriptsdir, "femalesampleinfo.list")
                    output.male = os.path.join(scriptsdir, "malesampleinfo.list")
                    MSLF = open(output.female, 'w')
                    MSLM = open(output.male, 'w')
                    for sample_name in sampleInfo.keys():
                        sample = sampleInfo[sample_name]
                        LineParam = []
                        for dataTag in sampleInfo[sample_name].keys():
                            LineParam.append({
                                "ID": sample[dataTag]['id'],
                                "RG": sample[dataTag]['rg'],
                                "FQ1": 'file://' + sample[dataTag]['fq1'],
                                "FQ2": sample[dataTag].has_key('fq2') and 'file://' + sample[dataTag]['fq2'] or 'null',
                                "ADP1": sample[dataTag].has_key('adp1') and 'file://' + sample[dataTag]['adp1'] or 'null',
                                "ADP2": sample[dataTag].has_key('adp2') and 'file://' + sample[dataTag]['adp2'] or 'null'
                            })
                        gender = self.sample[sample_name]["gender"]
                        impl.fileAppend(
                            fh=gender == 'female' and MSLF or MSLM,
                            commands=line,
                            JobParamList=LineParam)
                else:
                    output.normal = os.path.join(scriptsdir, "sampleinfo.list")
                    MSL = open(output.normal, 'w')
                    for sample_name in sampleInfo.keys():
                        sample = sampleInfo[sample_name]
                        LineParam = []
                        for dataTag in sample.keys():
                            LineParam.append({
                                "ID": sample[dataTag]['id'],
                                "RG": sample[dataTag]['rg'],
                                "FQ1": 'file://' + sample[dataTag]['fq1'],
                                "FQ2": sample[dataTag].has_key('fq2') and 'file://' + sample[dataTag]['fq2'] or 'null',
                                "ADP1": sample[dataTag].has_key('adp1') and 'file://' + sample[dataTag]['adp1'] or 'null',
                                "ADP2": sample[dataTag].has_key('adp2') and 'file://' + sample[dataTag]['adp2'] or 'null'
                            })
                        impl.fileAppend(
                            fh=MSL,
                            commands=line,
                            JobParamList=LineParam)
                result.output[sampleName] = output
            else:
                result.script = bundle()
                for sampleName in sampleInfo.keys():
                    scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
                    hdfs_gz_tmp = os.path.join(self.option.dirHDFS, sampleName, 'data', 'gz_tmp')
                    sample = sampleInfo[sampleName]
                    output = bundle()
                    DataParam = []
                    cmd = []
                    for dataTag in sample.keys():
                        rawData = impl.mkdir(self.option.workdir, 'fq', 'raw_data', sampleName)
                        laneData = os.path.join(rawData, dataTag)
                        cmd.append("mkdir -p -m 777 %s" % laneData)
                        output[dataTag] = bundle()
                        pathTup = impl.splitext(sample[dataTag]['fq1'])
                        if pathTup and pathTup[1] == '.gz':
                            DataParam.append({
                                "KEY": sample[dataTag]['fq1'],
                                "VALUE": os.path.join(laneData, pathTup[0])
                            })
                            output[dataTag]['fq1'] = os.path.join(laneData, pathTup[0])
                        else:
                            output[dataTag]['fq1'] = sample[dataTag]['fq1']
                        if self.init.isSE == False:
                            pathTup = impl.splitext(sample[dataTag]['fq2'])
                            if pathTup and pathTup[1] == '.gz':
                                DataParam.append({
                                    "KEY": sample[dataTag]['fq2'],
                                    "VALUE": os.path.join(laneData, pathTup[0])
                                })
                                output[dataTag]['fq2'] = os.path.join(laneData, pathTup[0])
                            else:
                                output[dataTag]['fq2'] = sample[dataTag]['fq2']
                        if sample[dataTag].has_key('adp1'):
                            pathTup = impl.splitext(sample[dataTag]['adp1'])
                            if pathTup and pathTup[1] == '.gz':
                                DataParam.append({
                                    "KEY": sample[dataTag]['adp1'],
                                    "VALUE": os.path.join(laneData, pathTup[0])
                                })
                                output[dataTag]['adp1'] = os.path.join(laneData, pathTup[0])
                            else:
                                output[dataTag]['adp1'] = sample[dataTag]['adp1']
                        if sample[dataTag].has_key('adp2'):
                            pathTup = impl.splitext(sample[dataTag]['adp2'])
                            if pathTup and pathTup[1] == '.gz':
                                DataParam.append({
                                    "KEY": sample[dataTag]['adp2'],
                                    "VALUE": os.path.join(laneData, pathTup[0])
                                })
                                output[dataTag]['adp2'] = os.path.join(laneData, pathTup[0])
                            else:
                                output[dataTag]['adp2'] = sample[dataTag]['adp2']
                    # print DataParam
                    if DataParam:
                        impl.write_file(
                            fileName='data.list',
                            scriptsdir=scriptsdir,
                            commands=["${KEY}\t${VALUE}"],
                            JobParamList=DataParam)
                    mapper = []
                    mapper.append("#!/usr/bin/perl -w")
                    mapper.append("use strict;\n")
                    mapper.append("while(<STDIN>)\n{")
                    mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
                    mapper.append("\tif(!-e $tmp[1])\n\t{")
                    mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
                    mapper.append("\t\texit 1;\n\t}")
                    mapper.append("\tsystem(\"gzip -cd $tmp[1] >$tmp[2]\");\n}")
                    impl.write_file(
                        fileName='upload_mapper.pl',
                        scriptsdir=scriptsdir,
                        commands=mapper)
                    hadoop_parameter = ' -D mapred.job.name="gzip input data" '
                    hadoop_parameter += ' -D mapred.map.tasks=%d ' % len(DataParam)
                    hadoop_parameter += ' -D mapred.reduce.tasks=0 '
                    hadoop_parameter += ' -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat '
                    ParamDict = {
                        "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.hadoop.streamingJar),
                        "MAPPER": os.path.join(scriptsdir, 'upload_mapper.pl'),
                        "INPUT": 'file://' + os.path.join(scriptsdir, 'data.list'),
                        "OUTPUT": hdfs_gz_tmp,
                        "HADOOPPARAM": hadoop_parameter
                    }
                    cmd.append('%s ${OUTPUT}' % self.fs_cmd.delete)
                    cmd.append('${PROGRAM} ${HADOOPPARAM} -input ${INPUT} -output ${OUTPUT} -mapper "perl ${MAPPER}"')
                    # write script
                    scriptPath = \
                        impl.write_shell(
                            name='init',
                            scriptsdir=scriptsdir,
                            commands=cmd,
                            paramDict=ParamDict)
                    result.script[sampleName] = scriptPath
                    result.output[sampleName] = output
            if self.init.qualitysystem == '':
                self.check_qs(sampleInfo)
                print "[INFO ] -- qualitysystem is %s (autocheck)--" % self.init.qualitysystem
            else:
                print "[INFO ] -- qualitysystem is %s --" % self.init.qualitysystem
            # self.init.qualitysystem = 0
        else:
            sampleName = self.option.multiSampleName
            startStep = self.analysisList[0]
            fs_type = ''
            if self.analysisDict[startStep].platform == 'H':
                fs_type = 'file://'
            if self.option.multiSample:
                inputDir = os.path.join(self.option.workdir, 'raw_data', 'bams')
                result.output[sampleName] = fs_type + inputDir
                if os.path.exists(inputDir):
                    shutil.rmtree(inputDir)
                impl.mkdir(inputDir)
                for sample_name in sampleInfo.keys():
                    bam = os.path.basename(sampleInfo[sample_name])
                    ln_bam = os.path.join(inputDir, sample_name + "_" + bam)
                    os.symlink(sampleInfo[sample_name], ln_bam)
            else:
                for sample_name in sampleInfo.keys():
                    result.output[sample_name] = fs_type + sampleInfo[sample_name]
    # return
    return result
class Workflow(object):
    ''' The basic class of each APP '''
    INIT = bundle()
    fs_cmd = bundle()
    result = bundle(output=bundle(), script=bundle())
    ParamDict = bundle()
    JobParamList = []
    cmd = []

    def __init__(self, state):
        self.__dict__.clear()
        self.__dict__.update(state)
        hadoop = self.hadoop.bin
        if self.hadoop.has_key('fs_mode') and self.hadoop.fs_mode == 'hdfs':
            if self.hadoop.has_key('ishadoop2'):
                if isinstance(self.hadoop.ishadoop2, str):
                    if self.hadoop.ishadoop2.upper() == 'FALSE':
                        self.hadoop.ishadoop2 = False
                    else:
                        self.hadoop.ishadoop2 = True
            if self.hadoop.ishadoop2:
                self.fs_cmd.delete = "%s fs -rm -r -skipTrash " % hadoop
                self.fs_cmd.mkdir = "%s fs -mkdir " % hadoop
                self.fs_cmd.put = "%s fs -put " % hadoop
                self.fs_cmd.cp = "%s fs -copyToLocal " % hadoop
                self.fs_cmd.ls = "%s fs -ls " % hadoop
            else:
                self.fs_cmd.delete = "%s dfs -rmr -skipTrash " % hadoop
                self.fs_cmd.mkdir = "%s dfs -mkdir " % hadoop
                self.fs_cmd.put = "%s dfs -put " % hadoop
                self.fs_cmd.cp = "%s dfs -copyToLocal " % hadoop
                self.fs_cmd.ls = "%s dfs -ls " % hadoop
        elif self.hadoop.has_key('is_at_TH') and self.hadoop.is_at_TH:
            self.fs_cmd.delete = "rm -rf "
            self.fs_cmd.mkdir = "mkdir -p "
            self.fs_cmd.put = "ln -s "
            self.fs_cmd.cp = "cp -r "
            self.fs_cmd.ls = "ls -l "
        else:
            self.fs_cmd.delete = "rm -rf "
            self.fs_cmd.mkdir = "mkdir -p "
            self.fs_cmd.put = "ln -s "
            self.fs_cmd.cp = "cp -rf "
            self.fs_cmd.ls = "ls -l "
        state.fs_cmd = self.fs_cmd

    def expath(self, paramName, mustBe=True):
        field = paramName.split('.')
        state = self.__dict__
        if len(field) == 1:
            path_tmp = state[field[0]]
        elif len(field) == 2:
            path_tmp = state[field[0]][field[1]]
        elif len(field) == 3:
            path_tmp = state[field[0]][field[1]][field[2]]
        elif len(field) == 4:
            path_tmp = state[field[0]][field[1]][field[2]][field[3]]
        elif len(field) == 5:
            path_tmp = state[field[0]][field[1]][field[2]][field[3]][field[4]]
        else:
            raise RuntimeError('paramName (%s) is wrong!' % paramName)
        if not path_tmp:
            if mustBe:
                raise RuntimeError('Program does not exist: %s' % paramName)
            else:
                return ''
        if os.path.exists(path_tmp):
            return path_tmp
        else:
            for p in self.Path.prgDir.split(':'):
                if os.path.exists(os.path.join(p, path_tmp)):
                    return os.path.join(p, path_tmp)
        if mustBe:
            raise RuntimeError('Program does not exist: %s = %s' % (paramName, path_tmp))
        else:
            return path_tmp

    def main(self, impl, dependList):
        self.run(impl, dependList)
        print self.__class__.__name__
        # write script
        scriptPath = \
            impl.write_scripts(
                name=self.__class__.__name__,
                commands=self.cmd,
                JobParamList=self.JobParamList,
                paramDict=self.ParamDict)
        # result
        self.result.script.update(scriptPath)
        return self.result
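# bundle is used throughout this code but is not defined in this excerpt. From its usage
# (keyword construction, dict-style indexing, has_key, and attribute access), it is presumably
# an attribute-style dict. A minimal, hypothetical sketch compatible with that usage:
class _bundle_sketch(dict):
    """Hypothetical stand-in for bundle: a dict whose keys can also be read and written as attributes."""

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value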
def run(self, impl, sampleInfo):
    mode = self.option.mode
    result = bundle(output=bundle(), script=bundle())
    # extend program path
    self.init.multiUploader = self.expath('init.multiUploader')
    self.init.gzUploader = self.expath('init.gzUploader')
    self.init.check_log = self.expath('init.check_log')
    self.init.bgzip = self.expath('init.bgzip', False)
    self.init.samtools = self.expath('init.samtools', False)
    sampleName = self.option.multiSampleName
    scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
    self.analysisList = self.analysisList[1:]
    hdfs_gz_tmp = os.path.join(self.option.dirHDFS, sampleName, 'data', 'gz_tmp')
    # result.output[sampleName] = output
    DataParam = []
    output = bundle()
    cmd = []
    for sample_name in sampleInfo.keys():
        sample = sampleInfo[sample_name]
        output[sample_name] = bundle()
        for dataTag in sample.keys():
            rawData = impl.mkdir(self.option.workdir, 'fq', 'raw_data', sample_name)
            laneData = os.path.join(rawData, dataTag)
            cmd.append("mkdir -p -m 777 %s" % laneData)
            output[sample_name][dataTag] = bundle()
            pathTup = impl.splitext(sample[dataTag]['fq1'])
            if pathTup and pathTup[1] == '.gz':
                DataParam.append({
                    "KEY": sample[dataTag]['fq1'],
                    "VALUE": os.path.join(laneData, pathTup[0])
                })
                output[sample_name][dataTag]['fq1'] = os.path.join(laneData, pathTup[0])
            else:
                output[sample_name][dataTag]['fq1'] = sample[dataTag]['fq1']
            if self.init.isSE == False:
                pathTup = impl.splitext(sample[dataTag]['fq2'])
                if pathTup and pathTup[1] == '.gz':
                    DataParam.append({
                        "KEY": sample[dataTag]['fq2'],
                        "VALUE": os.path.join(laneData, pathTup[0])
                    })
                    output[sample_name][dataTag]['fq2'] = os.path.join(laneData, pathTup[0])
                else:
                    output[sample_name][dataTag]['fq2'] = sample[dataTag]['fq2']
            if sample[dataTag].has_key('adp1'):
                pathTup = impl.splitext(sample[dataTag]['adp1'])
                if pathTup and pathTup[1] == '.gz':
                    DataParam.append({
                        "KEY": sample[dataTag]['adp1'],
                        "VALUE": os.path.join(laneData, pathTup[0])
                    })
                    output[sample_name][dataTag]['adp1'] = os.path.join(laneData, pathTup[0])
                else:
                    output[sample_name][dataTag]['adp1'] = sample[dataTag]['adp1']
            if sample[dataTag].has_key('adp2'):
                pathTup = impl.splitext(sample[dataTag]['adp2'])
                if pathTup and pathTup[1] == '.gz':
                    DataParam.append({
                        "KEY": sample[dataTag]['adp2'],
                        "VALUE": os.path.join(laneData, pathTup[0])
                    })
                    output[sample_name][dataTag]['adp2'] = os.path.join(laneData, pathTup[0])
                else:
                    output[sample_name][dataTag]['adp2'] = sample[dataTag]['adp2']
    # print DataParam
    if DataParam:
        impl.write_file(
            fileName='data.list',
            scriptsdir=scriptsdir,
            commands=["${KEY}\t${VALUE}"],
            JobParamList=DataParam)
    mapper = []
    mapper.append("#!/usr/bin/perl -w")
    mapper.append("use strict;\n")
    mapper.append("while(<STDIN>)\n{")
    mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
    mapper.append("\tif(!-e $tmp[1])\n\t{")
    mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
    mapper.append("\t\texit 1;\n\t}")
    mapper.append("\tsystem(\"gzip -cd $tmp[1] >$tmp[2]\");\n}")
    impl.write_file(
        fileName='upload_mapper.pl',
        scriptsdir=scriptsdir,
        commands=mapper)
    hadoop_parameter = ' -D mapred.job.name="gzip input data" '
    if self.hadoop.get('queue'):
        hadoop_parameter += '-D mapreduce.job.queuename={} '.format(self.hadoop.queue)
    hadoop_parameter += ' -D mapred.map.tasks=%d ' % len(DataParam)
    hadoop_parameter += ' -D mapred.reduce.tasks=0 '
    hadoop_parameter += ' -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat '
    ParamDict = {
        "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.hadoop.streamingJar),
        "MAPPER": os.path.join(scriptsdir, 'upload_mapper.pl'),
        "INPUT": 'file://' + os.path.join(scriptsdir, 'data.list'),
        "OUTPUT": hdfs_gz_tmp,
        "HADOOPPARAM": hadoop_parameter
    }
    cmd.append('%s ${OUTPUT}' % self.fs_cmd.delete)
    cmd.append('${PROGRAM} ${HADOOPPARAM} -input ${INPUT} -output ${OUTPUT} -mapper "perl ${MAPPER}"')
    # write script
    scriptPath = \
        impl.write_shell(
            name='init',
            scriptsdir=scriptsdir,
            commands=cmd,
            paramDict=ParamDict)
    result.script[sampleName] = scriptPath
    result.output = output
    if self.init.qualitySystem == '':
        self.check_qs(sampleInfo)
        print "[INFO ] -- qualitySystem is %s (autocheck)--" % self.init.qualitySystem
    else:
        print "[INFO ] -- qualitySystem is %s --" % self.init.qualitySystem
    return result
def run(self, impl, sampleInfo):
    mode = self.option.mode
    result = bundle(output=bundle(), script=bundle())
    # extend program path
    self.init.multiUploader = self.expath('init.multiUploader')
    self.init.gzUploader = self.expath('init.gzUploader')
    self.init.check_log = self.expath('init.check_log')
    self.init.bgzip = self.expath('init.bgzip', False)
    self.init.samtools = self.expath('init.samtools', False)
    mapper = []
    mapper.append("#!/usr/bin/perl -w")
    mapper.append("use strict;\n")
    mapper.append("while(<STDIN>)\n{")
    mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
    mapper.append("\tif(!-e $tmp[1])\n\t{")
    mapper.append("\t\tprint \"$tmp[1] don't exist.\\n\";")
    mapper.append("\t\texit 1;\n\t}")
    mapper.append(
        "\tsystem(\"%s jar %s GzUploader -i $tmp[1] -o $tmp[2] -n $tmp[3]\");\n}"
        % (self.hadoop.bin, self.init.gzUploader))
    # self.analysisList = self.analysisList[1:]
    output = bundle()
    DataParam = []
    for sample_name in sampleInfo.keys():
        raw_data = os.path.join(self.option.dirHDFS, sample_name, 'fq')
        scriptsdir = impl.mkdir(self.gaeaScriptsDir, sample_name)
        hdfs_gz_tmp = os.path.join(self.option.dirHDFS, sample_name, 'data', 'gz_tmp')
        sample = sampleInfo[sample_name]
        output[sample_name] = bundle()
        for dataTag in sample.keys():
            output[sample_name][dataTag] = bundle()
            pathTup = impl.splitext(sample[dataTag]['fq1'])
            filename = '{}_{}_{}'.format(sample_name, dataTag, pathTup[0])
            DataParam.append({
                "KEY": sample[dataTag]['fq1'],
                "VALUE": raw_data,
                "VALUE2": filename
            })
            output[sample_name][dataTag]['fq1'] = os.path.join(raw_data, filename)
            if not self.init.isSE:
                pathTup = impl.splitext(sample[dataTag]['fq2'])
                filename = '{}_{}_{}'.format(sample_name, dataTag, pathTup[0])
                DataParam.append({
                    "KEY": sample[dataTag]['fq2'],
                    "VALUE": raw_data,
                    "VALUE2": filename
                })
                output[sample_name][dataTag]['fq2'] = os.path.join(raw_data, filename)
        impl.write_file(fileName='upload_mapper.pl',
                        scriptsdir=scriptsdir,
                        commands=mapper)
        hadoop_parameter = ' -D mapred.job.name="upload data" '
        if self.hadoop.get('queue'):
            hadoop_parameter += '-D mapreduce.job.queuename={} '.format(self.hadoop.queue)
        hadoop_parameter += ' -D mapred.map.tasks=%d ' % len(DataParam)
        hadoop_parameter += ' -D mapred.reduce.tasks=0 '
        hadoop_parameter += ' -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat '
        ParamDict = {
            "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.hadoop.streamingJar),
            "MAPPER": os.path.join(scriptsdir, 'upload_mapper.pl'),
            "INPUT": 'file://' + os.path.join(self.gaeaScriptsDir, 'data.list'),
            "OUTPUT": hdfs_gz_tmp,
            "HADOOPPARAM": hadoop_parameter
        }
        # write script
        scriptPath = \
            impl.write_shell(
                name='init',
                scriptsdir=scriptsdir,
                commands=['${PROGRAM} -i ${INPUT} -l'],
                paramDict=ParamDict)
        result.script[sample_name] = scriptPath
    impl.write_file(fileName='data.list',
                    scriptsdir=self.gaeaScriptsDir,
                    commands=["${KEY}\t${VALUE}\t${VALUE2}"],
                    JobParamList=DataParam)
    result.output = output
    if self.init.qualitysystem == '':
        self.check_qs(sampleInfo)
        print "[INFO ] -- qualitysystem is %s (autocheck)--" % self.init.qualitysystem
    else:
        print "[INFO ] -- qualitysystem is %s --" % self.init.qualitysystem
    return result
def parse_sample(sampleList):
    total_number = 0
    with open(sampleList, 'r') as f:
        sampleInfo = bundle()
        for line in f:
            total_number += 1
            line = line.strip()
            field = line.split()
            sampleName = field[0]
            fq1 = field[1]
            fq2 = ''
            if len(field) >= 3:
                fq2 = field[2]
            if not os.path.exists(fq1):
                logger.error("%s under %s does not exist." % (fq1, sampleName))
                exit(3)
            fq_dir = os.path.dirname(fq1)
            fq_name = os.path.basename(fq1)
            # date_md_flowcell_laneID_lib
            # 100920_I126_FC801V9ABXX_L6_HUMlatXAOIDCBAPEI-8_2.fq
            tmp = fq_name.split("_")
            rg_ID = tmp[4] + "_" + tmp[2] + "-" + tmp[3]
            rg_PU = tmp[0] + "_" + tmp[1] + "_" + tmp[2] + "_" + tmp[3] + "_" + tmp[4]
            rg_LB = tmp[4]
            rg = "@RG\\tID:%s\\tPL:illumina\\tPU:%s\\tLB:%s\\tSM:%s\\tCN:BGI" % (
                rg_ID, rg_PU, rg_LB, sampleName)
            fq_lib_name = rg_ID
            if not sampleInfo.has_key(sampleName):
                sampleInfo[sampleName] = bundle()
                sample_lane_counter = 0
            else:
                sample_lane_counter = len(sampleInfo[sampleName])
            dataTag = 'data' + str(sample_lane_counter)
            if not sampleInfo[sampleName].has_key(dataTag):
                sampleInfo[sampleName][dataTag] = bundle()
            # find adp1
            sampleInfo[sampleName][dataTag]['fq1'] = fq1
            adp1 = glob.glob("%s/*1.adapter.list*" % fq_dir)
            if adp1:
                adp1_file = adp1[0].strip()
                sampleInfo[sampleName][dataTag]['adp1'] = adp1_file
            else:
                sampleInfo[sampleName][dataTag]['adp1'] = 'null'
            # find fq2 and adp2
            fq2 = fq1.replace("1.fq.gz", "2.fq.gz")
            if os.path.exists(fq2):
                sampleInfo[sampleName][dataTag]['fq2'] = fq2
                adp2 = glob.glob("%s/*2.adapter.list*" % fq_dir)
                if adp2:
                    adp2_file = adp2[0].strip()
                    sampleInfo[sampleName][dataTag]['adp2'] = adp2_file
                else:
                    sampleInfo[sampleName][dataTag]['adp2'] = 'null'
            else:
                logger.warning("%s of line: %d is SE data!" % (sampleName, total_number))
            sampleInfo[sampleName][dataTag]['rg'] = rg
            sampleInfo[sampleName][dataTag]['libname'] = fq_lib_name
    return sampleInfo
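# Worked example of the name parsing above, using the pattern from the comment
# (date_machine_flowcell_laneID_lib). For 100920_I126_FC801V9ABXX_L6_HUMlatXAOIDCBAPEI-8_1.fq.gz:
#   tmp   = ['100920', 'I126', 'FC801V9ABXX', 'L6', 'HUMlatXAOIDCBAPEI-8', '1.fq.gz']
#   rg_ID = 'HUMlatXAOIDCBAPEI-8_FC801V9ABXX-L6'
#   rg_PU = '100920_I126_FC801V9ABXX_L6_HUMlatXAOIDCBAPEI-8'
#   rg_LB = 'HUMlatXAOIDCBAPEI-8'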
hadoop_parameter } cmd.append('%s ${OUTPUT}' % self.fs_cmd.delete) cmd.append( '${PROGRAM} ${HADOOPPARAM} -input ${INPUT} -output ${OUTPUT} -mapper "perl ${MAPPER}"') # write script scriptPath = \ impl.write_shell( name='init', scriptsdir=scriptsdir, commands=cmd, paramDict=ParamDict) result.script[sampleName] = scriptPath result.output[sampleName] = output if self.init.qualitySystem == '': self.check_qs(sampleInfo) print "[INFO ] -- qualitySystem is %s (autocheck)--" % self.init.qualitySystem else: print "[INFO ] -- qualitySystem is %s --" % self.init.qualitySystem # self.init.qualitySystem = 0 else: sampleName = self.option.multiSampleName startStep = self.analysisList[0] fs_type = '' if self.analysisDict[startStep].platform == 'H': fs_type = 'file://' if self.option.multiSample: inputDir = os.path.join(self.option.workdir, 'raw_data', 'bams') result.output[sampleName] = fs_type + inputDir if os.path.exists(inputDir): shutil.rmtree(inputDir) impl.mkdir(inputDir) for sample_name in sampleInfo.keys(): bam = os.path.basename(sampleInfo[sample_name]) ln_bam = os.path.join(inputDir, sample_name + "_" + bam) os.symlink(sampleInfo[sample_name], ln_bam) else: for sample_name in sampleInfo.keys(): result.output[sample_name] = fs_type + sampleInfo[sample_name] # return return result
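# Illustration (not part of the pipeline): the multi-sample branch above hands impl.fileAppend
# a "${ID}\t${RG}\t..." template plus a list of per-lane substitution dicts. The helper below
# mimics that expansion for a single lane so the expected sampleinfo.list layout is easy to see.
# It assumes impl.fileAppend performs simple ${NAME} substitution on the template; the helper
# name render_sampleinfo_line is hypothetical and does not exist in the codebase.
import re


def render_sampleinfo_line(data, rg_id):
    """Expand the ID/RG/FQ1/FQ2/ADP1/ADP2 template for one lane bundle."""
    values = {
        "ID": rg_id,
        "RG": data['rg'],
        "FQ1": 'file://' + data['fq1'],
        "FQ2": 'fq2' in data and 'file://' + data['fq2'] or 'null',
        "ADP1": 'adp1' in data and 'file://' + data['adp1'] or 'null',
        "ADP2": 'adp2' in data and 'file://' + data['adp2'] or 'null',
    }
    template = "${ID}\t${RG}\t${FQ1}\t${FQ2}\t${ADP1}\t${ADP2}"
    return re.sub(r"\$\{(\w+)\}", lambda m: values[m.group(1)], template)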
def parse_sample(sampleList):
    total_number = 0
    with open(sampleList, 'r') as f:
        sampleInfo = bundle()
        for line in f:
            total_number += 1
            line = line.strip()
            field = line.split()
            sampleName = field[0]
            fq1 = field[1]
            fq2 = ''
            if len(field) >= 3:
                fq2 = field[2]
            if not os.path.exists(fq1):
                logger.error("%s under %s doesn't exist." % (fq1, sampleName))
                exit(3)
            fq_dir = os.path.dirname(fq1)
            fq_name = os.path.basename(fq1)
            # date_md_flowcell_laneID_lib
            # 100920_I126_FC801V9ABXX_L6_HUMlatXAOIDCBAPEI-8_2.fq
            tmp = fq_name.split("_")
            rg_ID = tmp[4] + "_" + tmp[2] + "-" + tmp[3]
            rg_PU = tmp[0] + "_" + tmp[1] + "_" + tmp[2] + "_" + tmp[3] + "_" + tmp[4]
            rg_LB = tmp[4]
            rg = "@RG\\tID:%s\\tPL:illumina\\tPU:%s\\tLB:%s\\tSM:%s\\tCN:BGI" % (rg_ID, rg_PU, rg_LB, sampleName)
            fq_lib_name = rg_ID
            if not sampleInfo.has_key(sampleName):
                sampleInfo[sampleName] = bundle()
                sample_lane_counter = 0
            else:
                sample_lane_counter = len(sampleInfo[sampleName])
            dataTag = 'data' + str(sample_lane_counter)
            if not sampleInfo[sampleName].has_key(dataTag):
                sampleInfo[sampleName][dataTag] = bundle()
            # find adp1
            sampleInfo[sampleName][dataTag]['fq1'] = fq1
            adp1 = glob.glob("%s/*1.adapter.list*" % fq_dir)
            if adp1:
                adp1_file = adp1[0].strip()
                sampleInfo[sampleName][dataTag]['adp1'] = adp1_file
            else:
                sampleInfo[sampleName][dataTag]['adp1'] = 'null'
            # find fq2 and adp2
            fq2 = fq1
            fq2 = fq2.replace("1.fq.gz", "2.fq.gz")
            if os.path.exists(fq2):
                sampleInfo[sampleName][dataTag]['fq2'] = fq2
                adp2 = glob.glob("%s/*2.adapter.list*" % fq_dir)
                if adp2:
                    adp2_file = adp2[0].strip()
                    sampleInfo[sampleName][dataTag]['adp2'] = adp2_file
                else:
                    sampleInfo[sampleName][dataTag]['adp2'] = 'null'
            else:
                logger.warning("%s of line: %d is SE data!" % (sampleName, total_number))
            sampleInfo[sampleName][dataTag]['rg'] = rg
            sampleInfo[sampleName][dataTag]['libname'] = fq_lib_name
    return sampleInfo
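# Worked example (illustrative only; sample name and path are made up): this variant expects
# one "<sampleName> <fq1> [<fq2>]" line per lane, with fq1 named by the
# date_md_flowcell_laneID_lib convention. The snippet repeats just the filename-splitting
# logic above so the derived read-group fields can be checked in isolation.
fq_name = "100920_I126_FC801V9ABXX_L6_HUMlatXAOIDCBAPEI-8_1.fq.gz"
tmp = fq_name.split("_")
rg_ID = tmp[4] + "_" + tmp[2] + "-" + tmp[3]   # HUMlatXAOIDCBAPEI-8_FC801V9ABXX-L6
rg_PU = "_".join(tmp[0:5])                     # 100920_I126_FC801V9ABXX_L6_HUMlatXAOIDCBAPEI-8
rg_LB = tmp[4]                                 # HUMlatXAOIDCBAPEI-8
print "@RG\\tID:%s\\tPL:illumina\\tPU:%s\\tLB:%s\\tSM:%s\\tCN:BGI" % (rg_ID, rg_PU, rg_LB, "NA12878")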
def run(self, impl, sampleInfo):
    mode = self.option.mode
    result = bundle(output=bundle(), script=bundle())

    # extend program path
    self.init.multiUploader = self.expath('init.multiUploader')
    self.init.gzUploader = self.expath('init.gzUploader')
    self.init.check_log = self.expath('init.check_log')
    self.init.bgzip = self.expath('init.bgzip', False)
    self.init.samtools = self.expath('init.samtools', False)

    sampleName = self.option.multiSampleName
    scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
    self.analysisList = self.analysisList[1:]
    hdfs_gz_tmp = os.path.join(self.option.dirHDFS, sampleName, 'data', 'gz_tmp')
    # result.output[sampleName] = output
    DataParam = []
    output = bundle()
    cmd = []
    for sample_name in sampleInfo.keys():
        sample = sampleInfo[sample_name]
        output[sample_name] = bundle()
        for dataTag in sample.keys():
            rawData = impl.mkdir(self.option.workdir, 'fq', 'raw_data', sample_name)
            laneData = os.path.join(rawData, dataTag)
            cmd.append("mkdir -p -m 777 %s" % laneData)
            output[sample_name][dataTag] = bundle()
            pathTup = impl.splitext(sample[dataTag]['fq1'])
            if pathTup and pathTup[1] == '.gz':
                DataParam.append({
                    "KEY": sample[dataTag]['fq1'],
                    "VALUE": os.path.join(laneData, pathTup[0])
                })
                output[sample_name][dataTag]['fq1'] = os.path.join(laneData, pathTup[0])
            else:
                output[sample_name][dataTag]['fq1'] = sample[dataTag]['fq1']
            if self.init.isSE == False:
                pathTup = impl.splitext(sample[dataTag]['fq2'])
                if pathTup and pathTup[1] == '.gz':
                    DataParam.append({
                        "KEY": sample[dataTag]['fq2'],
                        "VALUE": os.path.join(laneData, pathTup[0])
                    })
                    output[sample_name][dataTag]['fq2'] = os.path.join(laneData, pathTup[0])
                else:
                    output[sample_name][dataTag]['fq2'] = sample[dataTag]['fq2']
            if sample[dataTag].has_key('adp1'):
                pathTup = impl.splitext(sample[dataTag]['adp1'])
                if pathTup and pathTup[1] == '.gz':
                    DataParam.append({
                        "KEY": sample[dataTag]['adp1'],
                        "VALUE": os.path.join(laneData, pathTup[0])
                    })
                    output[sample_name][dataTag]['adp1'] = os.path.join(laneData, pathTup[0])
                else:
                    output[sample_name][dataTag]['adp1'] = sample[dataTag]['adp1']
            if sample[dataTag].has_key('adp2'):
                pathTup = impl.splitext(sample[dataTag]['adp2'])
                if pathTup and pathTup[1] == '.gz':
                    DataParam.append({
                        "KEY": sample[dataTag]['adp2'],
                        "VALUE": os.path.join(laneData, pathTup[0])
                    })
                    output[sample_name][dataTag]['adp2'] = os.path.join(laneData, pathTup[0])
                else:
                    output[sample_name][dataTag]['adp2'] = sample[dataTag]['adp2']

    # print DataParam
    if DataParam:
        impl.write_file(
            fileName='data.list',
            scriptsdir=scriptsdir,
            commands=["${KEY}\t${VALUE}"],
            JobParamList=DataParam)

    mapper = []
    mapper.append("#!/usr/bin/perl -w")
    mapper.append("use strict;\n")
    mapper.append("while(<STDIN>)\n{")
    mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
    mapper.append("\tif(!-e $tmp[1])\n\t{")
    mapper.append("\t\tprint \"$tmp[1] doesn't exist.\\n\";")
    mapper.append("\t\texit 1;\n\t}")
    mapper.append("\tsystem(\"gzip -cd $tmp[1] >$tmp[2]\");\n}")
    impl.write_file(
        fileName='upload_mapper.pl',
        scriptsdir=scriptsdir,
        commands=mapper)

    hadoop_parameter = ' -D mapred.job.name="gzip input data" '
    if self.hadoop.get('queue'):
        hadoop_parameter += '-D mapreduce.job.queuename={} '.format(self.hadoop.queue)
    hadoop_parameter += ' -D mapred.map.tasks=%d ' % len(DataParam)
    hadoop_parameter += ' -D mapred.reduce.tasks=0 '
    hadoop_parameter += ' -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat '
    ParamDict = {
        "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.hadoop.streamingJar),
        "MAPPER": os.path.join(scriptsdir, 'upload_mapper.pl'),
        "INPUT": 'file://' + os.path.join(scriptsdir, 'data.list'),
        "OUTPUT": hdfs_gz_tmp,
        "HADOOPPARAM": hadoop_parameter
    }
    cmd.append('%s ${OUTPUT}' % self.fs_cmd.delete)
    cmd.append('${PROGRAM} ${HADOOPPARAM} -input ${INPUT} -output ${OUTPUT} -mapper "perl ${MAPPER}"')

    # write script
    scriptPath = \
        impl.write_shell(
            name='init',
            scriptsdir=scriptsdir,
            commands=cmd,
            paramDict=ParamDict)
    result.script[sampleName] = scriptPath
    result.output = output

    if self.init.qualitysystem == '':
        self.check_qs(sampleInfo)
        print "[INFO ] -- qualitysystem is %s (autocheck) --" % self.init.qualitysystem
    else:
        print "[INFO ] -- qualitysystem is %s --" % self.init.qualitysystem
    return result
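# For reference (assumed layout, not generated anywhere above): each data.list record written
# by the run() above is "<source .fq.gz path>\t<decompressed destination path>", and the
# streaming job runs upload_mapper.pl once per record to gunzip the file into place. The sketch
# below is a hypothetical local equivalent of that mapper, handy for dry-running a data.list
# without Hadoop; decompress_data_list is not part of the pipeline.
import subprocess
import sys


def decompress_data_list(data_list):
    """Read KEY\tVALUE lines and run `gzip -cd KEY > VALUE` for each, like upload_mapper.pl."""
    with open(data_list) as fh:
        for record in fh:
            src, dest = record.rstrip("\n").split("\t")[:2]
            with open(dest, "wb") as out:
                if subprocess.call(["gzip", "-cd", src], stdout=out) != 0:
                    sys.exit(1)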
def parse_sample(sampleList):
    total_number = 0
    sampleInfo = bundle()
    with open(sampleList, 'r') as f:
        for line in f:
            field = line.strip().split()
            sampleName = field[0]
            rg_LB = field[1]
            rg_PU = field[2]
            fq_paths = field[-1].strip()
            fq1s = []
            fq_tmp = fq_paths.split(";")
            for fq in fq_tmp:
                if fq.endswith("1.fq.gz"):
                    fq1s.append(fq)
            # fq1s = filter(lambda x: x.endswith("1.fq.gz"), fq_tmp)
            if len(fq1s) == 0 or not os.path.exists(fq1s[0]):
                logger.error("fq1 under %s doesn't exist." % sampleName)
                exit(3)
            for fq1 in fq1s:
                total_number += 1
                if not sampleInfo.has_key(sampleName):
                    sampleInfo[sampleName] = bundle()
                    sample_lane_counter = 0
                else:
                    sample_lane_counter = len(sampleInfo[sampleName])
                # fq_name = os.path.basename(fq1)
                # fq_dir = os.path.abspath(os.path.dirname(fq1))
                # slideID_laneID_barcode
                # CL100035764_L02_33_1.fq.gz
                # tmp = fq_name.split("_")
                # rg_PU = tmp[0] + "_" + tmp[1] + "_" + tmp[2]
                rg_ID = "{}_{}".format(sampleName, sample_lane_counter)
                rg = "@RG\\tID:%s\\tPL:COMPLETE\\tPU:%s\\tLB:%s\\tSM:%s\\tCN:BGI" % (rg_ID, rg_PU, rg_LB, sampleName)
                fq_lib_name = rg_ID
                dataTag = 'data' + str(sample_lane_counter)
                if not sampleInfo[sampleName].has_key(dataTag):
                    sampleInfo[sampleName][dataTag] = bundle()
                sampleInfo[sampleName][dataTag]['fq1'] = fq1
                # find fq2
                fq2 = fq1
                fq2 = fq2.replace("1.fq.gz", "2.fq.gz")
                if os.path.exists(fq2):
                    sampleInfo[sampleName][dataTag]['fq2'] = fq2
                else:
                    logger.warning("%s of line: %d is SE data!" % (sampleName, total_number))
                sampleInfo[sampleName][dataTag]['rg'] = rg
                sampleInfo[sampleName][dataTag]['libname'] = fq_lib_name
                sampleInfo[sampleName][dataTag]['gender'] = 'male'
    return sampleInfo
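# Worked example (illustrative; sample name and paths are made up): this variant expects one
# line per sample, "<sampleName> <LB> <PU> ... <fastq paths joined by ';'>", e.g.
#   S001  lib1  CL100035764_L02_33  /data/CL100035764_L02_33_1.fq.gz;/data/CL100035764_L02_33_2.fq.gz
# Only the *1.fq.gz entries become fq1; the mate is derived by substitution, as shown here.
fq_paths = "/data/CL100035764_L02_33_1.fq.gz;/data/CL100035764_L02_33_2.fq.gz"
fq1s = [fq for fq in fq_paths.split(";") if fq.endswith("1.fq.gz")]
fq2 = fq1s[0].replace("1.fq.gz", "2.fq.gz")
print fq1s[0], fq2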
class init(Workflow):
    """ init data, init data path """
    INIT = bundle(hadoop=bundle(), init=bundle())
    INIT.init.multiUploader = 'multi_uploader.pl'
    INIT.init.gzUploader = "GzUpload.jar"
    INIT.init.bgzip = 'bgzip'
    INIT.init.perl = 'perl'
    INIT.init.samtools = 'samtools'
    INIT.init.qualitysystem = ''
    INIT.init.check_log = '%s' % os.path.join(os.environ['GAEA_HOME'], 'bin', 'check_log.pl')
    INIT.init.check_state_param = ''
    INIT.hadoop.ishadoop2 = False
    INIT.hadoop.is_at_TH = False
    INIT.hadoop.fs_mode = 'hdfs'
    INIT.hadoop.input_format = 'file'
    INIT.hadoop.mapper_num = '112'
    INIT.hadoop.reducer_num = '112'

    def check_qs(self, sampleInfo):
        for sample_name in sampleInfo:
            for dataTag in sampleInfo[sample_name]:
                fq = sampleInfo[sample_name][dataTag]['fq1']
                self.init.qualitysystem = qualitysystem.getqualitysystem(fq)
                if self.init.qualitysystem != '-1':
                    return self.init.qualitysystem
        if self.init.qualitysystem == '-1':
            raise RuntimeError('qualitysystem is wrong, the value is -1')

    def run(self, impl, sampleInfo):
        mode = self.option.mode
        result = bundle(output=bundle(), script=bundle())

        # extend program path
        self.init.multiUploader = self.expath('init.multiUploader')
        self.init.gzUploader = self.expath('init.gzUploader')
        self.init.check_log = self.expath('init.check_log')
        self.init.bgzip = self.expath('init.bgzip', False)
        self.init.samtools = self.expath('init.samtools', False)
        print self.init.gzUploader

        sampleName = self.option.multiSampleName
        scriptsdir = impl.mkdir(self.gaeaScriptsDir, sampleName)
        self.analysisList = self.analysisList[1:]
        hdfs_gz_tmp = os.path.join(self.option.dirHDFS, sampleName, 'data', 'gz_tmp')
        rawData = os.path.join(self.option.dirHDFS, sampleName, 'fq')
        DataParam = []
        output = bundle()
        cmd = []
        for sample_name in sampleInfo.keys():
            sample = sampleInfo[sample_name]
            output[sample_name] = bundle()
            for dataTag in sample.keys():
                output[sample_name][dataTag] = bundle()
                pathTup = impl.splitext(sample[dataTag]['fq1'])
                filename = '{}_{}_{}'.format(sample_name, dataTag, pathTup[0])
                DataParam.append({
                    "KEY": sample[dataTag]['fq1'],
                    "VALUE": rawData,
                    "VALUE2": filename
                })
                output[sample_name][dataTag]['fq1'] = os.path.join(rawData, filename)
                if self.init.isSE == False:
                    pathTup = impl.splitext(sample[dataTag]['fq2'])
                    filename = '{}_{}_{}'.format(sample_name, dataTag, pathTup[0])
                    DataParam.append({
                        "KEY": sample[dataTag]['fq2'],
                        "VALUE": rawData,
                        "VALUE2": filename
                    })
                    output[sample_name][dataTag]['fq2'] = os.path.join(rawData, filename)

        if DataParam:
            impl.write_file(
                fileName='data.list',
                scriptsdir=scriptsdir,
                commands=["${KEY}\t${VALUE}\t${VALUE2}"],
                JobParamList=DataParam)

        mapper = []
        mapper.append("#!/usr/bin/perl -w")
        mapper.append("use strict;\n")
        mapper.append("while(<STDIN>)\n{")
        mapper.append("\tchomp;\n\tmy @tmp = split(/\\t/);")
        mapper.append("\tif(!-e $tmp[1])\n\t{")
        mapper.append("\t\tprint \"$tmp[1] doesn't exist.\\n\";")
        mapper.append("\t\texit 1;\n\t}")
        mapper.append("\tsystem(\"%s jar %s GzUploader -i $tmp[1] -o $tmp[2] -n $tmp[3]\");\n}" % (self.hadoop.bin, self.init.gzUploader))
        impl.write_file(
            fileName='upload_mapper.pl',
            scriptsdir=scriptsdir,
            commands=mapper)

        hadoop_parameter = ' -D mapred.job.name="upload data" '
        if self.hadoop.get('queue'):
            hadoop_parameter += '-D mapreduce.job.queuename={} '.format(self.hadoop.queue)
        hadoop_parameter += ' -D mapred.map.tasks=%d ' % len(DataParam)
        hadoop_parameter += ' -D mapred.reduce.tasks=0 '
        hadoop_parameter += ' -inputformat org.apache.hadoop.mapred.lib.NLineInputFormat '
        ParamDict = {
            "PROGRAM": "%s jar %s" % (self.hadoop.bin, self.hadoop.streamingJar),
            "MAPPER": os.path.join(scriptsdir, 'upload_mapper.pl'),
            "INPUT": 'file://' + os.path.join(scriptsdir, 'data.list'),
            "OUTPUT": hdfs_gz_tmp,
            "HADOOPPARAM": hadoop_parameter
        }
        cmd.append('%s ${OUTPUT}' % self.fs_cmd.delete)
        # cmd.append('${PROGRAM} ${HADOOPPARAM} -input ${INPUT} -output ${OUTPUT} -mapper "perl ${MAPPER}"')
        cmd.append('%s jar %s GzUploader -i %s -l' % (self.hadoop.bin, self.init.gzUploader, os.path.join(scriptsdir, 'data.list')))

        # write script
        scriptPath = \
            impl.write_shell(
                name='init',
                scriptsdir=scriptsdir,
                commands=cmd,
                paramDict=ParamDict)
        result.script[sampleName] = scriptPath
        result.output = output

        if self.init.qualitysystem == '':
            self.check_qs(sampleInfo)
            print "[INFO ] -- qualitysystem is %s (autocheck) --" % self.init.qualitysystem
        else:
            print "[INFO ] -- qualitysystem is %s --" % self.init.qualitysystem
        return result
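# Illustration (made-up names and paths): in the GzUploader variant above every fastq is
# renamed "<sample>_<dataTag>_<original stem>" and uploaded under <dirHDFS>/<multiSampleName>/fq,
# so each data.list record is "<local .fq.gz>\t<HDFS fq dir>\t<renamed file>". The snippet below
# reproduces just that naming step, assuming impl.splitext() behaves like os.path.splitext on the
# basename (splitting off the trailing .gz); the destination directory here is only a stand-in.
import os

sample_name, dataTag = "S001", "data0"
fq1 = "/data/CL100035764_L02_33_1.fq.gz"
stem = os.path.splitext(os.path.basename(fq1))[0]   # CL100035764_L02_33_1.fq
filename = '{}_{}_{}'.format(sample_name, dataTag, stem)
rawData = "/user/pipeline/run1/fq"                  # stands in for dirHDFS/<multiSampleName>/fq
print "%s\t%s\t%s" % (fq1, rawData, filename)       # one data.list record
print os.path.join(rawData, filename)               # the fq1 path handed to downstream steps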