Beispiel #1
0
def _ensure_directory(path):
    """Create directory *path* (including parents) unless it already exists, logging the outcome."""
    if not os.path.exists(path):
        os.makedirs(path)
        log.info("%s created" % path)
    else:
        log.info("%s path already exists" % path)


def _copy_and_verify(source, destination):
    """Copy *source* to *destination* unless it is already there, then compare md5 checksums.

    The checksum comparison runs whether the file was just copied or was
    already present; a mismatch is logged as an error and is not fatal.
    """
    util.checkFile(source)
    if not os.path.exists(destination):
        util.runProcess("cp %s %s" % (source, destination))
    else:
        log.info("%s file already exists" % destination)
    if not has_same_md5(source, destination):
        # The copied file is the destination and the original is the source.
        log.error("Copied file %s is not the same as original file %s" % (destination, source))


def main():
    """Import assembly contig and scaffold files into the hierarchy.

    Command-line options (all required; help is printed and the program
    exits when any is missing):
      -a/--assembly  FILE listing the contigs and scaffolds to import, one
                     run per line with seven '||'-separated fields:
                     project||genus||species||assembly_id||contig_file||scaffold_file||run
                     Lines starting with '!' are ignored.
      -r/--root      PATH to the root of the hierarchy (must already exist).
      -c/--category  one of constants.CATEGORY.

    For every input line whose species path exists, the ASSEMBLY/<assembly_id>
    directories are created, the contig and scaffold files are copied in and
    checked by md5, a symlink to the run's fastq directory is created, and,
    for the first run of each project only, a job is submitted that builds
    the index and statistics files that are still missing.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-a", "--assembly", metavar="FILE", help="FILE containing the list of all contigs and scaffolds to import", action="store", type="string", dest="assembly")
    parser.add_option("-r", "--root", metavar="PATH", help="PATH to the root of the hierarchy", action="store", type="string", dest="root")
    parser.add_option("-c", "--category", metavar="CATEGORY", help="name of the category from %s" % constants.CATEGORY, action="store", choices=constants.CATEGORY, dest="category")

    (options, args) = parser.parse_args()

    if not (options.assembly and options.root and options.category):
        parser.print_help()
        sys.exit()

    # check root path
    if not os.path.exists(options.root):
        log.error("%s path do not exist" % options.root)
        log.error("Create root path first, then run pipeline before importing assembly files.")
        sys.exit()

    # check input assembly file and read it - one line per run (lane)
    util.checkFile(options.assembly)
    with open(options.assembly, "r") as assembly_handle:
        assembly_lines = assembly_handle.readlines()

    # compare project name - could have more than one run per project
    previous_project = ""
    for line in assembly_lines:
        if line[0] == '!':
            continue
        if not line.count('||') == 6:
            # Blank lines are skipped quietly; anything else is reported so
            # that a malformed record does not vanish without a trace.
            if line.strip():
                log.warning("Skipping malformed line (expected 7 '||'-separated fields): %s" % line.strip())
            continue
        (project, genus, species, assembly_id,
         contig_file, scaffold_file, run) = line.strip().split('||')

        # get lane hierarchy path (run)
        run_path = util.runProcess("/software/bin/perl /nfs/users/nfs_a/ap12/genlibpy/genepy/pathtrack/get_lane_hierarchy_path.pl --lane=%s --db=%s" % (run, constants.DATABASE[options.category])).strip()
        fastq_file = "%s/%s/seq-pipelines/%s" % (options.root, options.category, run_path)

        # A project is "new" on the first of its consecutive lines; jobs are
        # only submitted once per project.
        is_new_project = project != previous_project
        previous_project = project

        # check species path
        species_path = "%s/%s/seq-pipelines/%s/%s" % (options.root, options.category, genus, species)
        if not os.path.exists(species_path):
            log.error("%s path do not exist" % species_path)
            log.error("Run fastq import pipeline before importing assembly files.")
            continue

        # create assembly path and assembly_id path (e.g. newbler_2009_06_29)
        assembly_path = "%s/ASSEMBLY" % species_path
        _ensure_directory(assembly_path)
        assembly_id_path = "%s/%s" % (assembly_path, assembly_id)
        _ensure_directory(assembly_id_path)

        # copy contigs file
        contig_file_hierarchy = "%s/LargeContigs.fna" % assembly_id_path
        _copy_and_verify(contig_file, contig_file_hierarchy)

        # copy scaffolds file
        scaffold_file_hierarchy = "%s/Scaffolds.fna" % assembly_id_path
        _copy_and_verify(scaffold_file, scaffold_file_hierarchy)

        # create fastqs path
        fastqs_path = "%s/fastqs" % assembly_id_path
        _ensure_directory(fastqs_path)

        # create symlink to the fastq directory of this run
        util.checkDir(fastq_file)
        symlink = "%s/%s" % (fastqs_path, run)
        if not os.path.exists(symlink):
            os.symlink(fastq_file, symlink)
            log.info("%s symlink created" % symlink)
        else:
            log.info("%s symlink already exists" % symlink)

        # run samtools faidx, refstats, bwa to generate extra files required for the QC pipeline;
        # only the commands whose output file is missing are queued
        cmd = ""
        if not os.path.exists("%s.fai" % scaffold_file_hierarchy):
            cmd = "samtools faidx %s; " % scaffold_file_hierarchy
        else:
            log.info("%s.fai already exists" % scaffold_file_hierarchy)

        if not os.path.exists("%s.refstats" % scaffold_file_hierarchy):
            cmd = cmd + "ref-stats -r %s > %s.refstats; " % (scaffold_file_hierarchy, scaffold_file_hierarchy)
        else:
            log.info("%s.refstats already exists" % scaffold_file_hierarchy)

        if not os.path.exists("%s.bwt" % scaffold_file_hierarchy):
            cmd = cmd + "bwa index %s; " % scaffold_file_hierarchy
        else:
            log.info("%s.bwt already exists" % scaffold_file_hierarchy)

        # run stats.py on contigs and scaffolds
        cmd_stats = "python /nfs/users/nfs_a/ap12/genlibpy/genepy/pathtrack/stats.py -f %s; "
        for fasta in (contig_file_hierarchy, scaffold_file_hierarchy):
            if not os.path.exists("%s.stats" % fasta):
                cmd = cmd + cmd_stats % fasta
            else:
                log.info("%s.stats already exists" % fasta)

        # submit all jobs
        if is_new_project and not cmd == "":
            util.submitJob(jobname='stats_%s_%s' % (project, assembly_id), cmd=cmd, outdir=assembly_id_path)
def _sequence_index_line(fastq, run, study, strain, insert_size, layout, mate):
    """Return one tab-separated, newline-terminated sequence.index record.

    Columns, in order (25 in total):
      fastq_file|md5|run_id|study_id|(study_name)|center_name|(submission_id)|(submission_date)|sample_id|sample_name|
      (population)|(experiment_id)|instrument_platform|(instrument_model)|library_name|(run_name)|(run_block_name)|
      insert_size|(library_layout)|paired_fastq|withdrawn|(withdrawn_date)|(comment)|read_count|base_count

    The md5 column holds the literal placeholder 'md5'.
    """
    empty = 'n/a'
    fields = (fastq, 'md5', run, study, study, 'SC', empty, empty, strain, strain, strain, empty, '454',
              empty, run, empty, empty, insert_size, layout, mate, '0', empty, empty, '0', '0')
    return "\t".join(fields) + "\n"


def main():
    """Generate metadata index files, and optionally fastq files, for a list of 454 runs.

    Command-line options:
      -l            FILE listing the 454 runs, one per line with nine
                    '||'-separated fields; lines starting with '!' are ignored
                    and any other malformed line aborts the program. Required.
      -o/--outpath  PATH under which 'metadata' and 'fastq' directories are
                    used. Required.
      --fastq       submit jobs converting sff files into gzipped fastq files
                    (only when IS_LSF is true and the files are missing).
      --md5         after writing sequence.index, rewrite it with the md5sum
                    of each fastq file that exists; records whose fastq file
                    is missing are dropped from the rewritten file.

    Writes sequence.index, samples.info and assembly.index into
    <outpath>/metadata.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all 454 runs and its associated information", action="store", type="string", dest="list")
    parser.add_option("-o", "--outpath", metavar="PATH", help="PATH where to generate indexes and temporary fastq files", action="store", type="string", dest="outpath")
    parser.add_option("--fastq", help="Do generate fastq files.", action="store_true", dest="fastq")
    parser.add_option("--md5", help="Do run md5sum on generated fastq files.", action="store_true", dest="md5")

    (options, args) = parser.parse_args()

    # Both the list file and the output path are needed; bail out with the
    # usage text when either one is missing.
    if not (options.list and options.outpath):
        parser.print_help()
        sys.exit()

    # input file
    input_file = options.list
    util.checkFile(input_file)
    with open(input_file, "r") as input_handle:
        input_lines = input_handle.readlines()

    # output path
    output_path = options.outpath
    util.checkDir(output_path)
    out_metadata = "%s/metadata" % output_path
    util.checkDir(out_metadata)
    out_fastq = "%s/fastq" % output_path
    util.checkDir(out_fastq)

    # checking file format first before processing it
    lines = []
    for line in input_lines:
        if line[0] == '!':
            continue
        if not line.count('||') == 8:
            log.error("line is not well formated. Please check your input file.")
            log.error(line.count('||'))
            log.error(line)
            sys.exit()
        lines.append(line)
    log.debug(lines)

    # opening output files; the context managers close them even when a
    # check inside the loop aborts the run
    sequence_index_filename = '%s/sequence.index' % out_metadata
    with open(sequence_index_filename, 'w') as sequence_index, \
            open('%s/samples.info' % out_metadata, 'w') as samples_info, \
            open('%s/assembly.index' % out_metadata, 'w') as assembly_index:

        # processing input file
        for line in lines:
            line = line.strip()
            values = line.split('||')
            log.info(line)
            genus = values[1]
            species = values[2]
            strain = values[3]
            organism_name = "%s %s %s" % (genus, species, strain)
            run = values[4]
            # values[5] (paired flag) is not used: one PAIRED record per mate
            # and one SINGLE record are always written below.
            insert_size = values[6]
            sff_file = "/nfs/%s" % values[7]
            trim_status = "/nfs/%s/454TrimStatus.txt" % values[8]
            contigs_file = "/nfs/%s/454LargeContigs.fna" % values[8]
            scaffolds_file = "/nfs/%s/454Scaffolds.fna" % values[8]

            species_strain = sub('[^a-zA-Z0-9_]', '_', "%s_%s" % (species, strain)).replace('__', '_')
            study = "%s_%s%s" % (values[0], genus[0], species_strain)

            # check that project name (study) is less than 40 char
            # mysql> desc project;
            # | name           | varchar(40)           | NO   | MUL |         |                |
            # | hierarchy_name | varchar(40)           | NO   | MUL |         |                |
            if len(study) > 40:
                log.warning("Project name %s has more than 40 char." % study)

            # checking files
            util.checkFile(sff_file)
            util.checkFile(trim_status)
            util.checkFile(contigs_file)
            util.checkFile(scaffolds_file)

            # convert sff into fastq
            outprefix = "%s/%s" % (out_fastq, run)
            cmd_sff2fastq = "/nfs/users/nfs_m/mh12/svn-repository/pathogen/user/mh12/python/454TrimStatus2reads.py --pair_suffix=/1,/2 --sff %s %s %s" % (sff_file, trim_status, outprefix)
            fastq_pairs = "%s-pairs.fastq" % outprefix
            fastq_single = "%s-single.fastq" % outprefix

            # split fastq pairs file
            fastq_1 = "%s_1.fastq" % outprefix
            fastq_2 = "%s_2.fastq" % outprefix
            cmd_splitfastq = "/nfs/users/nfs_m/mh12/svn-repository/pathogen/user/mh12/python/fastn_unshuffle.py %s %s %s" % (fastq_pairs, fastq_1, fastq_2)

            # rename fastq single file
            fastq_0 = "%s.fastq" % outprefix
            cmd_rename = "mv %s %s" % (fastq_single, fastq_0)

            # tidy-up
            cmd_remove = "rm %s-info.txt; rm %s-pairs.fastq" % (outprefix, outprefix)

            # gzip fastq files
            cmd_gzip = "gzip %s; gzip %s; gzip %s" % (fastq_1, fastq_2, fastq_0)

            # all commands
            cmd = "%s; %s; %s; %s; %s" % (cmd_sff2fastq, cmd_splitfastq, cmd_rename, cmd_remove, cmd_gzip)

            fastq_1_gz = "%s.gz" % fastq_1
            fastq_2_gz = "%s.gz" % fastq_2
            fastq_0_gz = "%s.gz" % fastq_0

            if IS_LSF:
                if not (os.path.exists(fastq_1_gz) and os.path.exists(fastq_2_gz) and os.path.exists(fastq_0_gz)):
                    if options.fastq:
                        util.submitJob(jobname='sff2fastq_%s' % run, cmd=cmd, outdir=out_metadata)
                    else:
                        log.info("fastq files do not exist, use '--fastq' to generate them.")
                else:
                    log.info("fastq files already exist.")
            else:
                log.info("Need to be run on LSF.")

            # write to output files
            # sequence.index: two PAIRED records (each naming the other mate) and one SINGLE record
            sequence_index.write(_sequence_index_line(fastq_1_gz, run, study, strain, insert_size, 'PAIRED', fastq_2_gz))
            sequence_index.write(_sequence_index_line(fastq_2_gz, run, study, strain, insert_size, 'PAIRED', fastq_1_gz))
            sequence_index.write(_sequence_index_line(fastq_0_gz, run, study, strain, insert_size, 'SINGLE', ''))

            # samples.info: lookup_name|acc|individual_name|alias|population_name|species_name|taxon_id|sex
            empty = 'n/a'
            samples_info.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (strain, strain, strain, strain, strain, organism_name, empty, empty))

            # assembly.index: study||genus||species_strain||assembly_id||contig||scaffold||run
            # /pyrodata01/assemblies/Ruminococcus/obeum/A2162/P_2009_07_12_22_16_23_runAssembly/454TrimStatus.txt
            # NOTE(review): assumes the trim status path contains a '/P_' component
            # followed by a date; raises IndexError otherwise.
            assembly_id = "newbler_%s" % trim_status.split('/P_')[1][:10]
            assembly_index.write("%s||%s||%s||%s||%s||%s||%s\n" % (study, genus, species_strain, assembly_id, contigs_file, scaffolds_file, run))

    if not options.fastq:
        log.info("Use '--fastq' for generating fastq files")

    if options.md5:
        # calculate md5 and modify sequence.index
        util.checkFile(sequence_index_filename)
        with open(sequence_index_filename, "r") as index_handle:
            seq_lines = index_handle.readlines()
        with open(sequence_index_filename, 'w') as sequence_index:
            for line in seq_lines:
                fields = line.split('\t')
                fastq = fields[0]
                if os.path.exists(fastq):
                    md5 = util.runProcess("md5sum %s | cut -d ' ' -f 1" % fastq).strip()
                    # Only the md5 column is replaced, so an 'md5' substring in
                    # a file path elsewhere in the record is left alone.
                    fields[1] = md5
                    sequence_index.write("\t".join(fields))
                else:
                    log.info("fastq file %s does not exist, use '--fastq' for generating it." % fastq)
    else:
        log.info("When all submitted jobs end, use '--md5' for updating sequence.index with md5sum.")