def _tsv_line(fields):
    """Return *fields* %s-formatted, tab-separated and newline-terminated."""
    return "\t".join(["%s"] * len(fields)) % tuple(fields) + "\n"


def main():
    """Write the metadata index files for the 454 projects of one category.

    Command-line options (both required; help is printed and the program
    exits when either is missing):

    * ``-r/--root``      path to the root of the hierarchy
    * ``-c/--category``  one of ``constants.CATEGORY``

    Rows are read from the ``<Category>_454_Projects`` spreadsheet through
    ``gdocs.getValues``.  Each row is a ``||``-separated record whose fields
    are used as: 0 study prefix, 1 genus, 2 species, 3 strain, 4 sample,
    5 library, 6 run, 7 paired flag ('1' means paired), 8 insert size,
    9 sff file path and 10 assembly directory (9 and 10 relative to /nfs).

    Three files are written under ``<root>/.tmp/metadata/<category>``:

    * ``sequence.index`` and ``samples.info`` get entries for a run only when
      its three fastq.gz files exist under ``<root>/.tmp/fastq/<category>``
      and have not yet been imported into the hierarchy.
    * ``assembly.index`` gets an entry in that case and also when the fastq
      files are already imported into the hierarchy.

    Rows whose paired flag is not '1' are logged as errors and skipped.
    The output files are closed even if processing a row raises.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--root", metavar="PATH",
                      help="PATH to the root of the hierarchy",
                      action="store", type="string", dest="root")
    parser.add_option("-c", "--category", metavar="CATEGORY",
                      help="name of the category from %s" % constants.CATEGORY,
                      action="store", choices=constants.CATEGORY,
                      dest="category")
    (options, _args) = parser.parse_args()

    if not (options.root and options.category):
        parser.print_help()
        sys.exit()

    # get the data from the Google spreadsheet
    lines = gdocs.getValues(doc='%s_454_Projects' % options.category.title())

    # check output path
    util.checkDir(options.root)
    out_metadata = "%s/.tmp/metadata/%s" % (options.root, options.category)
    util.checkDir(out_metadata)
    out_fastq = "%s/.tmp/fastq/%s" % (options.root, options.category)
    util.checkDir(out_fastq)

    instrument_platform = '454'
    empty = 'n/a'

    # open output files; the with-statement guarantees they are closed even
    # when a malformed row or a failing check raises inside the loop
    with open('%s/sequence.index' % out_metadata, 'w') as sequence_index, \
            open('%s/samples.info' % out_metadata, 'w') as samples_info, \
            open('%s/assembly.index' % out_metadata, 'w') as assembly_index:

        # process input data
        for line in lines:
            values = line.strip().split('||')
            genus = values[1]
            species = values[2]
            strain = values[3]
            organism_name = "%s %s %s" % (genus, species, strain)
            # sample falls back to the strain, library falls back to the run
            sample = values[4]
            if sample == 'None':
                sample = strain
            library = values[5]
            run = values[6]
            if library == 'None':
                library = run
            # only paired read sets are handled; skip the row before any of
            # the remaining fields are touched
            if values[7] != '1':
                log.error("Single read set for %s. Not implemented." % run)
                continue
            insert_size = values[8]
            sff_file = "/nfs/%s" % values[9]
            trim_status = "/nfs/%s/454TrimStatus.txt" % values[10]
            contigs_file = "/nfs/%s/454LargeContigs.fna" % values[10]
            scaffolds_file = "/nfs/%s/454Scaffolds.fna" % values[10]

            species_strain = sub('[^a-zA-Z0-9_]', '_', "%s_%s" % (species, strain)).replace('__', '_')
            # NOTE(review): not used below in this function; kept from the
            # original in case it documents the hierarchy naming -- confirm.
            strain_4_hierarchy = sub('[^a-zA-Z0-9_]', '_', strain).replace('__', '_')
            study = "%s_%s%s" % (values[0], genus[0], species_strain)

            fastq_1_gz = "%s/%s_1.fastq.gz" % (out_fastq, run)
            fastq_2_gz = "%s/%s_2.fastq.gz" % (out_fastq, run)
            fastq_0_gz = "%s/%s.fastq.gz" % (out_fastq, run)

            # check that project name (study) is less than 40 char
            # mysql> desc project;
            # | name           | varchar(40)           | NO   | MUL |         |
            # | hierarchy_name | varchar(40)           | NO   | MUL |         |
            if len(study) > 40:
                log.warning("Project name %s has more than 40 char." % study)

            # checking files
            util.checkFile(sff_file)
            util.checkFile(trim_status)
            util.checkFile(contigs_file)
            util.checkFile(scaffolds_file)

            log.info("> checking fastq files: %s" % run)
            # get lane hierarchy path (run)
            run_path = util.runProcess("/software/bin/perl /nfs/users/nfs_a/ap12/genlibpy/genepy/pathtrack/get_lane_hierarchy_path.pl --lane=%s --db=%s" % (run, constants.DATABASE[options.category])).strip()

            do_generate_indexes = False
            do_generate_assembly_indexes = False

            # check if fastq files have been loaded in db
            imported = False
            if run_path != "undefined":
                log.info(" loaded in db.")
                fastq_path = "%s/%s/seq-pipelines/%s" % (options.root, options.category, run_path)
                util.checkDir(fastq_path)
                # check if fastq files have been imported into the hierarchy
                imported = all(os.path.exists(path) for path in (
                    "%s/%s_1.fastq.gz" % (fastq_path, run),
                    "%s/%s_2.fastq.gz" % (fastq_path, run),
                    "%s/%s.fastq.gz" % (fastq_path, run)))
                if imported:
                    log.info(" already imported into hierarchy.")
                    do_generate_assembly_indexes = True
                else:
                    log.info(" not imported into hierarchy.")
            else:
                log.info(" not loaded in db.")

            if not imported:
                # check if fastq files have been generated from sff into tmp dir
                if all(os.path.exists(path) for path in (fastq_1_gz, fastq_2_gz, fastq_0_gz)):
                    log.info(" generate indexes.")
                    do_generate_indexes = True
                else:
                    log.info(" not generated from sff files.")

            # generate sequence and sample indexes
            if do_generate_indexes:
                # calculate every md5 before writing anything for this run
                md5_1, md5_2, md5_0 = [
                    util.runProcess("md5sum %s | cut -d ' ' -f 1" % fastq).strip()
                    for fastq in (fastq_1_gz, fastq_2_gz, fastq_0_gz)]

                # sequence.index: fastq_file|md5|run_id|study_id|(study_name)|center_name|(submission_id)|(submission_date)|sample_id|sample_name|
                #                 (population)|(experiment_id)|instrument_platform|(instrument_model)|library_name|(run_name)|(run_block_name)|
                #                 insert_size|(library_layout)|paired_fastq|withdrawn|(withdrawn_date)|(comment)|read_count|base_count
                # columns shared by the three fastq files of the run
                run_columns = (run, study, study, 'SC', empty, empty,
                               sample, sample, strain, empty,
                               instrument_platform, empty, library,
                               empty, empty, insert_size)
                closing_columns = ('0', empty, empty, '0', '0')
                # (fastq file, md5, library layout, mate fastq file)
                for fastq, md5, layout, mate in (
                        (fastq_1_gz, md5_1, 'PAIRED', fastq_2_gz),
                        (fastq_2_gz, md5_2, 'PAIRED', fastq_1_gz),
                        (fastq_0_gz, md5_0, 'SINGLE', '')):
                    sequence_index.write(_tsv_line(
                        (fastq, md5) + run_columns + (layout, mate) + closing_columns))

                # samples.info: lookup_name|acc|individual_name|alias|population_name|species_name|taxon_id|sex
                samples_info.write(_tsv_line(
                    (strain, strain, strain, strain, strain, organism_name, empty, empty)))

            # generate assembly indexes
            if do_generate_indexes or do_generate_assembly_indexes:
                # assembly.index: study||genus||species_strain||assembly_id||contig||scaffold||run
                # /pyrodata01/assemblies/Ruminococcus/obeum/A2162/P_2009_07_12_22_16_23_runAssembly/454TrimStatus.txt
                # assembly_id uses the 10 characters following "/P_" in the
                # trim status path ("2009_07_12" for the example above); a
                # path without "/P_" raises IndexError
                assembly_id = "newbler_%s" % trim_status.split('/P_')[1][:10]
                assembly_index.write("%s||%s||%s||%s||%s||%s||%s\n" % (study, genus, species_strain, assembly_id, contigs_file, scaffolds_file, run))