def _copy_and_verify(src_file, dest_file):
    """Copy src_file to dest_file (unless already present) and verify the md5.

    Logs an error when the md5 of the copy does not match the original.
    Behavior note: the md5 comparison is performed whether or not the copy
    already existed, mirroring both branches of the original code.
    """
    util.checkFile(src_file)
    if not os.path.exists(dest_file):
        util.runProcess("cp %s %s" % (src_file, dest_file))
    else:
        log.info("%s file already exists" % dest_file)
    if not has_same_md5(src_file, dest_file):
        # BUG FIX: the original omitted the '%' operator here
        # ("..." (a, b) is a call on a str -> TypeError at runtime),
        # and passed the arguments in the wrong order for the message.
        log.error("Copied file %s is not the same as original file %s" % (dest_file, src_file))


def main():
    """Import assembly files (contigs/scaffolds) into the pipeline hierarchy.

    Reads a '||'-separated assembly list (one line per run/lane), copies the
    contig and scaffold files into each species' ASSEMBLY directory, symlinks
    the fastq lane directory, and submits one stats/index job per new project.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-a", "--assembly", metavar="FILE",
                      help="FILE containing the list of all contigs and scaffolds to import",
                      action="store", type="string", dest="assembly")
    parser.add_option("-r", "--root", metavar="PATH",
                      help="PATH to the root of the hierarchy",
                      action="store", type="string", dest="root")
    parser.add_option("-c", "--category", metavar="CATEGORY",
                      help="name of the category from %s" % constants.CATEGORY,
                      action="store", choices=constants.CATEGORY, dest="category")
    (options, args) = parser.parse_args()
    # all three options are required
    if not (options.assembly and options.root and options.category):
        parser.print_help()
        sys.exit()
    # check root path
    if not os.path.exists(options.root):
        log.error("%s path do not exist" % options.root)
        log.error("Create root path first, then run pipeline before importing assembly files.")
        sys.exit()
    # check input assembly file and read it - one line per run (lane)
    util.checkFile(options.assembly)
    # close the file deterministically (original leaked the file handle)
    with open(options.assembly, "r") as assembly_fh:
        assembly_lines = assembly_fh.readlines()
    # compare project name - could have more than one run per project
    previous_project = ""
    is_new_project = True
    for line in assembly_lines:
        # '!' marks a comment line; lines must have exactly 7 '||'-separated fields
        if line[0] == '!':
            continue
        if not line.count('||') == 6:
            continue
        line = line.strip()
        values = line.split('||')
        project = values[0]
        genus = values[1]
        species = values[2]
        assembly_id = values[3]
        contig_file = values[4]
        scaffold_file = values[5]
        run = values[6]
        # get lane hierarchy path (run)
        run_path = util.runProcess("/software/bin/perl /nfs/users/nfs_a/ap12/genlibpy/genepy/pathtrack/get_lane_hierarchy_path.pl --lane=%s --db=%s" % (run, constants.DATABASE[options.category])).strip()
        fastq_file = "%s/%s/seq-pipelines/%s" % (options.root, options.category, run_path)
        # check if new project (input is assumed grouped by project)
        if project == previous_project:
            is_new_project = False
        else:
            previous_project = project
            is_new_project = True
        # check species path
        species_path = "%s/%s/seq-pipelines/%s/%s" % (options.root, options.category, genus, species)
        if not os.path.exists(species_path):
            log.error("%s path do not exist" % species_path)
            log.error("Run fastq import pipeline before importing assembly files.")
        else:
            # create assembly path
            assembly_path = "%s/ASSEMBLY" % species_path
            if not os.path.exists(assembly_path):
                os.makedirs(assembly_path)
                log.info("%s created" % assembly_path)
            else:
                log.info("%s path already exists" % assembly_path)
            # create assembly_id path (e.g. newbler_2009_06_29)
            assembly_id_path = "%s/%s" % (assembly_path, assembly_id)
            if not os.path.exists(assembly_id_path):
                os.makedirs(assembly_id_path)
                log.info("%s created" % assembly_id_path)
            else:
                log.info("%s path already exists" % assembly_id_path)
            # copy contigs file (duplicated copy/verify logic factored out)
            contig_file_hierarchy = "%s/LargeContigs.fna" % assembly_id_path
            _copy_and_verify(contig_file, contig_file_hierarchy)
            # copy scaffolds file
            scaffold_file_hierarchy = "%s/Scaffolds.fna" % assembly_id_path
            _copy_and_verify(scaffold_file, scaffold_file_hierarchy)
            # create fastqs path
            fastqs_path = "%s/fastqs" % assembly_id_path
            if not os.path.exists(fastqs_path):
                os.makedirs(fastqs_path)
                log.info("%s created" % fastqs_path)
            else:
                log.info("%s path already exists" % fastqs_path)
            # create simlinks to fastqs
            util.checkDir(fastq_file)
            fastq_name = run
            symlink = "%s/%s" % (fastqs_path, fastq_name)
            if not os.path.exists(symlink):
                os.symlink(fastq_file, symlink)
                log.info("%s symlink created" % symlink)
            else:
                log.info("%s symlink already exists" % symlink)
            # run samtools faidx, refstats, bwa to generate extra files
            # required for the QC pipeline — only for outputs not yet present
            cmd = ""
            if not os.path.exists("%s.fai" % scaffold_file_hierarchy):
                cmd = "samtools faidx %s; " % scaffold_file_hierarchy
            else:
                log.info("%s.fai already exists" % scaffold_file_hierarchy)
            if not os.path.exists("%s.refstats" % scaffold_file_hierarchy):
                cmd = cmd + "ref-stats -r %s > %s.refstats; " % (scaffold_file_hierarchy, scaffold_file_hierarchy)
            else:
                log.info("%s.refstats already exists" % scaffold_file_hierarchy)
            if not os.path.exists("%s.bwt" % scaffold_file_hierarchy):
                cmd = cmd + "bwa index %s; " % scaffold_file_hierarchy
            else:
                log.info("%s.bwt already exists" % scaffold_file_hierarchy)
            # run stats.py on contigs and scaffolds
            cmd_stats = "python /nfs/users/nfs_a/ap12/genlibpy/genepy/pathtrack/stats.py -f %s; "
            if not os.path.exists("%s.stats" % contig_file_hierarchy):
                cmd = cmd + cmd_stats % contig_file_hierarchy
            else:
                log.info("%s.stats already exists" % contig_file_hierarchy)
            if not os.path.exists("%s.stats" % scaffold_file_hierarchy):
                cmd = cmd + cmd_stats % scaffold_file_hierarchy
            else:
                log.info("%s.stats already exists" % scaffold_file_hierarchy)
            # submit all jobs (one job per project, only if anything to do)
            if is_new_project and not cmd == "":
                util.submitJob(jobname='stats_%s_%s' % (project, assembly_id), cmd=cmd, outdir=assembly_id_path)
def main():
    """Build sequence.index, samples.info and assembly.index from a 454 run list.

    Reads a '||'-separated list of 454 runs (9 fields per line), writes the
    three metadata index files, optionally submits sff->fastq conversion jobs
    on LSF (--fastq), and optionally rewrites sequence.index with the md5sums
    of the generated fastq files (--md5).
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", metavar="FILE",
                      help="FILE containing the list of all 454 runs and its associated information",
                      action="store", type="string", dest="list")
    parser.add_option("-o", "--outpath", metavar="PATH",
                      help="PATH where to generate indexes and temporary fastq files",
                      action="store", type="string", dest="outpath")
    parser.add_option("--fastq", help="Do generate fastq files.",
                      action="store_true", dest="fastq")
    parser.add_option("--md5", help="Do run md5sum on generated fastq files.",
                      action="store_true", dest="md5")
    (options, args) = parser.parse_args()
    # BUG FIX: original tested 'not options.list and not options.outpath',
    # which only printed help when BOTH were missing; both are required,
    # so exit if either one is absent.
    if not (options.list and options.outpath):
        parser.print_help()
        sys.exit()
    # input file
    input_file = options.list
    util.checkFile(input_file)
    # close the file deterministically (original leaked the file handle)
    with open(input_file, "r") as input_fh:
        input_lines = input_fh.readlines()
    # output path
    output_path = options.outpath
    util.checkDir(output_path)
    out_metadata = "%s/metadata" % output_path
    util.checkDir(out_metadata)
    out_fastq = "%s/fastq" % output_path
    util.checkDir(out_fastq)
    # checking file format first before processing it
    lines = []
    for line in input_lines:
        if line[0] == '!':
            # '!' marks a comment line
            continue
        elif not line.count('||') == 8:
            log.error("line is not well formated. Please check your input file.")
            log.error(line.count('||'))
            log.error(line)
            sys.exit()
        else:
            lines.append(line)
    log.debug(lines)
    # opening output files
    sequence_index_filename = '%s/sequence.index' % out_metadata
    sequence_index = open(sequence_index_filename, 'w')
    samples_info = open('%s/samples.info' % out_metadata, 'w')
    assembly_index = open('%s/assembly.index' % out_metadata, 'w')
    # processing input file
    for line in lines:
        line = line.strip()
        values = line.split('||')
        log.info(line)
        genus = values[1]
        species = values[2]
        strain = values[3]
        organism_name = "%s %s %s" % (genus, species, strain)
        run = values[4]
        insert_size = values[6]
        sff_file = "/nfs/%s" % values[7]
        trim_status = "/nfs/%s/454TrimStatus.txt" % values[8]
        contigs_file = "/nfs/%s/454LargeContigs.fna" % values[8]
        scaffolds_file = "/nfs/%s/454Scaffolds.fna" % values[8]
        # normalise names for the hierarchy (alphanumeric and '_' only)
        species_strain = sub('[^a-zA-Z0-9_]', '_', "%s_%s" % (species, strain)).replace('__', '_')
        study = "%s_%s%s" % (values[0], genus[0], species_strain)
        # check that project name (study) is less than 40 char
        # mysql> desc project;
        # | name           | varchar(40) | NO | MUL | | |
        # | hierarchy_name | varchar(40) | NO | MUL | | |
        if len(study) > 40:
            log.warning("Project name %s has more than 40 char." % study)
        # checking files
        util.checkFile(sff_file)
        util.checkFile(trim_status)
        util.checkFile(contigs_file)
        util.checkFile(scaffolds_file)
        # convert sff into fastq
        outprefix = "%s/%s" % (out_fastq, run)
        cmd_sff2fastq = "/nfs/users/nfs_m/mh12/svn-repository/pathogen/user/mh12/python/454TrimStatus2reads.py --pair_suffix=/1,/2 --sff %s %s %s" % (sff_file, trim_status, outprefix)
        fastq_pairs = "%s-pairs.fastq" % outprefix
        fastq_single = "%s-single.fastq" % outprefix
        # split fastq pairs file
        fastq_1 = "%s_1.fastq" % outprefix
        fastq_2 = "%s_2.fastq" % outprefix
        cmd_splitfastq = "/nfs/users/nfs_m/mh12/svn-repository/pathogen/user/mh12/python/fastn_unshuffle.py %s %s %s" % (fastq_pairs, fastq_1, fastq_2)
        # rename fastq single file
        fastq_0 = "%s.fastq" % outprefix
        cmd_rename = "mv %s %s" % (fastq_single, fastq_0)
        # tidy-up
        cmd_remove = "rm %s-info.txt; rm %s-pairs.fastq" % (outprefix, outprefix)
        # gzip fastq files
        cmd_gzip = "gzip %s; gzip %s; gzip %s" % (fastq_1, fastq_2, fastq_0)
        # all commands
        cmd = "%s; %s; %s; %s; %s" % (cmd_sff2fastq, cmd_splitfastq, cmd_rename, cmd_remove, cmd_gzip)
        if IS_LSF:
            if not (os.path.exists("%s.gz" % fastq_1) and os.path.exists("%s.gz" % fastq_2) and os.path.exists("%s.gz" % fastq_0)):
                if options.fastq:
                    util.submitJob(jobname='sff2fastq_%s' % run, cmd=cmd, outdir=out_metadata)
                else:
                    log.info("fastq files do not exist, use '--fastq' to generate them.")
            else:
                log.info("fastq files already exist.")
        else:
            log.info("Need to be run on LSF.")
        instrument_platform = '454'
        empty = 'n/a'
        fastq_1_gz = "%s.gz" % fastq_1
        fastq_2_gz = "%s.gz" % fastq_2
        fastq_0_gz = "%s.gz" % fastq_0
        # write to output files
        # sequence.index: fastq_file|md5|run_id|study_id|(study_name)|center_name|(submission_id)|(submission_date)|sample_id|sample_name|
        # (population)|(experiment_id)|instrument_platform|(instrument_model)|library_name|(run_name)|(run_block_name)|
        # insert_size|(library_layout)|paired_fastq|withdrawn|(withdrawn_date)|(comment)|read_count|base_count
        seq_fmt = "\t".join(["%s"] * 25) + "\n"  # 25 tab-separated columns
        sequence_index.write(seq_fmt % (fastq_1_gz, 'md5', run, study, study, 'SC', empty, empty, strain, strain, strain, empty, instrument_platform, empty, run, empty, empty, insert_size, 'PAIRED', fastq_2_gz, '0', empty, empty, '0', '0'))
        sequence_index.write(seq_fmt % (fastq_2_gz, 'md5', run, study, study, 'SC', empty, empty, strain, strain, strain, empty, instrument_platform, empty, run, empty, empty, insert_size, 'PAIRED', fastq_1_gz, '0', empty, empty, '0', '0'))
        sequence_index.write(seq_fmt % (fastq_0_gz, 'md5', run, study, study, 'SC', empty, empty, strain, strain, strain, empty, instrument_platform, empty, run, empty, empty, insert_size, 'SINGLE', '', '0', empty, empty, '0', '0'))
        # samples.info: lookup_name|acc|individual_name|alias|population_name|species_name|taxon_id|sex
        samples_info.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (strain, strain, strain, strain, strain, organism_name, empty, empty))
        # assembly.index: study||genus||species_strain||assembly_id||contig||scaffold||run
        # assembly_id derived from the timestamped run directory, e.g.
        # /pyrodata01/assemblies/.../P_2009_07_12_22_16_23_runAssembly/454TrimStatus.txt -> newbler_2009_07_12
        assembly_id = "newbler_%s" % trim_status.split('/P_')[1][:10]
        assembly_index.write("%s||%s||%s||%s||%s||%s||%s\n" % (study, genus, species_strain, assembly_id, contigs_file, scaffolds_file, run))
    # close files
    sequence_index.close()
    samples_info.close()
    assembly_index.close()
    if not options.fastq:
        log.info("Use '--fastq' for generating fastq files")
    if options.md5:
        # calculate md5 and rewrite sequence.index in place
        util.checkFile(sequence_index_filename)
        with open(sequence_index_filename, "r") as seq_fh:
            seq_lines = seq_fh.readlines()
        sequence_index = open(sequence_index_filename, 'w')
        for line in seq_lines:
            values = line.split('\t')
            fastq = values[0]
            run = values[2]
            if os.path.exists(fastq):
                md5 = util.runProcess("md5sum %s | cut -d ' ' -f 1" % fastq).strip()
                # BUG FIX: replace only the md5 placeholder column instead of
                # line.replace('md5', md5), which would also corrupt any 'md5'
                # substring elsewhere in the line (e.g. in a file path)
                values[1] = md5
                line = '\t'.join(values)
            else:
                log.info("fastq file %s does not exist, use '--fastq' for generating it." % fastq)
            # BUG FIX: original only wrote back lines whose fastq existed,
            # silently dropping the others from the truncated index file;
            # keep every line (unmodified when the fastq is missing)
            sequence_index.write(line)
        # close file
        sequence_index.close()
    else:
        log.info("When all submitted jobs end, use '--md5' for updating sequence.index with md5sum.")