def main():
    """Run the selected quantification step on every library in a table.

    Command line: ``[--mode {htseq,cufflinks}] [--dryrun] [--keep-tmp]
    [-o OUTPUT_DIR] config_xml_file library_table gtf_file``.

    The configuration is loaded with ``Config.from_xml`` and then has its
    ``output_dir``, ``keep_tmp`` and ``dryrun`` attributes overridden from
    the command line.  Each library whose BAM file exists gets its own
    sub-directory of the output directory and is handed to ``htseq_count``
    or ``cufflinks`` depending on ``--mode``; libraries with a missing BAM
    file are skipped with a warning.

    Returns:
        Always 0.
    """
    # setup logging
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.DEBUG, format=log_format)
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")
    # command line parsing
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", dest="mode", default='htseq',
                        choices=['htseq', 'cufflinks'])
    parser.add_argument("--dryrun", dest="dryrun", default=False,
                        action="store_true")
    parser.add_argument("--keep-tmp", dest="keep_tmp", default=None,
                        action="store_true")
    parser.add_argument('-o', '--output-dir', dest="output_dir", default=None)
    for positional in ("config_xml_file", 'library_table', "gtf_file"):
        parser.add_argument(positional)
    args = parser.parse_args()
    # every input file must exist; parser.error() reports the first one
    # that does not and terminates the program
    required_inputs = (
        ("config xml file '%s' not found", args.config_xml_file),
        ("library table file '%s' not found", args.library_table),
        ("gtf file '%s' not found", args.gtf_file),
    )
    for template, path in required_inputs:
        if not os.path.exists(path):
            parser.error(template % (path))
    gtf_file = os.path.abspath(args.gtf_file)
    # read configuration and apply the command line overrides
    logging.info("Reading configuration file")
    config = Config.from_xml(args.config_xml_file)
    config.output_dir = (os.getcwd() if args.output_dir is None
                         else os.path.abspath(args.output_dir))
    config.keep_tmp = args.keep_tmp
    config.dryrun = args.dryrun
    # create output directory
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)
    # argparse restricts --mode to exactly these two keys
    process_library = {'htseq': htseq_count,
                       'cufflinks': cufflinks}[args.mode]
    # read library table
    logging.info("Parsing library table")
    processed = 0
    for lib in Library.from_file(args.library_table):
        if not os.path.exists(lib.bam_file):
            logging.warning("\t[SKIPPED] Library %s BAM file not found" % (lib.library_id))
            continue
        # setup library dir
        lib_dir = os.path.join(config.output_dir, lib.library_id)
        if not os.path.exists(lib_dir):
            os.makedirs(lib_dir)
        process_library(lib, gtf_file, config)
        processed += 1
    logging.info("Found %d libraries" % (processed))
    logging.info("Done")
    return 0
# Example no. 2
# 0
def main():
    """Command-line entry point: process each library of a library table.

    Positional arguments are the configuration XML file, the library table
    and a GTF file; all three must exist.  ``--mode`` chooses between
    ``htseq_count`` (default) and ``cufflinks``, ``-o/--output-dir`` sets
    the output directory (current working directory when omitted), and
    ``--dryrun`` / ``--keep-tmp`` are copied onto the configuration object.

    Libraries whose BAM file is missing are skipped with a warning; every
    other library gets a directory named after its library id inside the
    output directory before being processed.

    Returns:
        Always 0.
    """
    # setup logging
    logging.basicConfig(
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        level=logging.DEBUG)
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")
    # command line parsing: (names, keyword arguments), in declaration order
    argument_specs = (
        (("--mode",), dict(dest="mode",
                           choices=['htseq', 'cufflinks'],
                           default='htseq')),
        (("--dryrun",), dict(dest="dryrun",
                             action="store_true",
                             default=False)),
        (("--keep-tmp",), dict(dest="keep_tmp",
                               action="store_true",
                               default=None)),
        (('-o', '--output-dir'), dict(dest="output_dir", default=None)),
        (("config_xml_file",), {}),
        (('library_table',), {}),
        (("gtf_file",), {}),
    )
    parser = argparse.ArgumentParser()
    for names, options in argument_specs:
        parser.add_argument(*names, **options)
    args = parser.parse_args()

    def require_existing(path, message_template):
        # parser.error() prints the message and terminates the program
        if not os.path.exists(path):
            parser.error(message_template % (path))

    require_existing(args.config_xml_file, "config xml file '%s' not found")
    require_existing(args.library_table, "library table file '%s' not found")
    require_existing(args.gtf_file, "gtf file '%s' not found")
    gtf_file = os.path.abspath(args.gtf_file)
    # read configuration
    logging.info("Reading configuration file")
    config = Config.from_xml(args.config_xml_file)
    if args.output_dir is None:
        config.output_dir = os.getcwd()
    else:
        config.output_dir = os.path.abspath(args.output_dir)
    config.keep_tmp = args.keep_tmp
    config.dryrun = args.dryrun
    # create output directory
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)
    # read library table
    logging.info("Parsing library table")
    num_processed = 0
    for lib in Library.from_file(args.library_table):
        if os.path.exists(lib.bam_file):
            # setup library dir
            lib_dir = os.path.join(config.output_dir, lib.library_id)
            if not os.path.exists(lib_dir):
                os.makedirs(lib_dir)
            if args.mode == 'cufflinks':
                cufflinks(lib, gtf_file, config)
            elif args.mode == 'htseq':
                htseq_count(lib, gtf_file, config)
            num_processed += 1
        else:
            logging.warning("\t[SKIPPED] Library %s BAM file not found" %
                            (lib.library_id))
    logging.info("Found %d libraries" % (num_processed))
    logging.info("Done")
    return 0
def main():
    """Aggregate a reference GTF and per-library GTF files into one sorted GTF.

    Command line: ``[options] ref_gtf_file library_table_file``.

    Steps:
      1. Parse and validate the command line, then log the parameters.
      2. Create the run and tmp directories described by
         ``config.AssemblylineResults(output_dir)``.
      3. Read the library table, skipping libraries whose GTF file does not
         exist.  Kept libraries are renamed ``L1, L2, ...`` and their samples
         ``S1, S2, ...``; the new-id/old-id pairs are written tab-separated
         to the library and sample id map files.
      4. Write the stats header, add the reference GTF via
         ``add_reference_gtf_file`` and pass each library's transcripts
         through ``filter_transcripts`` into an unsorted temporary GTF.
      5. Sort the temporary GTF with ``sort_gtf`` and delete the temporary
         file.  If sorting fails, a partially written output GTF is removed.

    Returns:
        The return code of ``sort_gtf`` (0 means success).
    """
    # setup logging
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('--min-transcript-length', type=int,
                        dest="min_transcript_length",
                        metavar="N",
                        default=config.MIN_TRANSCRIPT_LENGTH,
                        help="Skip ab initio transcripts equal to or below "
                        "this length [default=%(default)s]")
    parser.add_argument("--gtf-score-attr", dest="gtf_score_attr",
                        default="FPKM", metavar="ATTR",
                        help="GTF attribute field containing transcript "
                        "expression [default='%(default)s']")
    parser.add_argument('-o', '--output-dir', dest="output_dir",
                        default="assemblyline_out",
                        help="directory to store assemblyline results and "
                        "intermediate files [default=%(default)s]")
    parser.add_argument("--random-test-frac", dest="random_test_frac",
                        default=0.1, metavar="FRAC", type=float,
                        help="if no user-defined tests are specified "
                        "using '--tests' randomly designate a fraction "
                        "of reference transcripts as test data for use "
                        "in classification [default=%(default)s]")
    parser.add_argument("--tests", dest="test_file", default=None,
                        help="(optional) text file containing "
                        "reference 'gene_id' attributes "
                        "(one per line) that define test cases "
                        "to use for validation purposes")
    parser.add_argument('ref_gtf_file')
    parser.add_argument('library_table_file')
    args = parser.parse_args()
    # check command line parameters (parser.error() terminates the program)
    if not os.path.exists(args.library_table_file):
        parser.error("library table file %s not found" % (args.library_table_file))
    if args.min_transcript_length < 0:
        parser.error("min_transcript_length < 0")
    if not os.path.exists(args.ref_gtf_file):
        parser.error("reference GTF file %s not found" % (args.ref_gtf_file))
    if (args.test_file is not None) and (not os.path.exists(args.test_file)):
        parser.error("test file %s not found" % (args.test_file))
    if (args.random_test_frac < 0):
        parser.error("cannot set --random-test-frac < 0")
    # show parameters
    logging.info("Parameters:")
    logging.info("min transcript length: %d" % (args.min_transcript_length))
    logging.info("gtf score attr:        %s" % (args.gtf_score_attr))
    logging.info("output directory:      %s" % (args.output_dir))
    logging.info("reference GTF file:    %s" % (args.ref_gtf_file))
    logging.info("test file:             %s" % (args.test_file))
    logging.info("library table file:    %s" % (args.library_table_file))
    logging.info("----------------------------------")
    # setup results
    results = config.AssemblylineResults(args.output_dir)
    # create output directory
    if not os.path.exists(results.run_dir):
        logging.debug("Creating output directory '%s'" % (results.run_dir))
        os.makedirs(results.run_dir)
    if not os.path.exists(results.tmp_dir):
        logging.info("Creating tmp directory '%s'" % (results.tmp_dir))
        os.makedirs(results.tmp_dir)
    # parse sample table
    logging.info("Parsing library table")
    libraries = []
    library_num = 1
    sample_num = 1
    sample_id_map = {}
    # 'with' guarantees both id map files are closed even if parsing fails
    with open(results.library_id_map, 'w') as library_map_fileh, \
            open(results.sample_id_map, 'w') as sample_map_fileh:
        for library in Library.from_file(args.library_table_file):
            # exclude samples
            if not os.path.exists(library.gtf_file):
                logging.warning("Library '%s' GTF file not found" % (library.library_id))
                continue
            # rename library id
            new_library_id = "L%d" % (library_num)
            library_map_fileh.write(
                '\t'.join([new_library_id, library.library_id]) + '\n')
            library_num += 1
            library.library_id = new_library_id
            # rename sample id; libraries sharing a sample share the new id
            if library.sample_id not in sample_id_map:
                new_sample_id = "S%d" % (sample_num)
                sample_map_fileh.write(
                    '\t'.join([new_sample_id, library.sample_id]) + '\n')
                sample_id_map[library.sample_id] = new_sample_id
                sample_num += 1
            else:
                new_sample_id = sample_id_map[library.sample_id]
            library.sample_id = new_sample_id
            libraries.append(library)
    # read test transcripts
    test_gene_ids = set()
    if args.test_file is not None:
        with open(args.test_file) as fileh:
            stripped_lines = (line.strip() for line in fileh)
            # ignore blank lines so an empty string is never a gene id
            test_gene_ids.update(gene_id for gene_id in stripped_lines
                                 if gene_id)
        logging.info("Read %d test genes" % len(test_gene_ids))
    # setup output files
    tmp_file = os.path.join(results.tmp_dir, "transcripts.unsorted.gtf")
    header_fields = ['#library_id']
    header_fields.extend(config.TRANSCRIPT_STATS_FIELDS)
    header_fields.extend([("failed_q%d" % x) for x in config.TRANSCRIPT_SCORE_QUANTILES])
    header_fields.extend([("passed_q%d" % x) for x in config.TRANSCRIPT_SCORE_QUANTILES])
    # all three outputs (including the dropped-transcripts file) are closed
    # when this block exits, so the unsorted GTF is complete before sorting
    with open(tmp_file, "w") as tmpfileh, \
            open(results.transcripts_dropped_gtf_file, "w") as dropfileh, \
            open(results.transcript_stats_file, 'w') as statsfileh:
        statsfileh.write('\t'.join(header_fields) + '\n')
        # read reference GTF file and aggregate
        logging.info("Adding reference GTF file")
        add_reference_gtf_file(args.ref_gtf_file, test_gene_ids,
                               args.random_test_frac, tmpfileh)
        # add transcripts from each library
        logging.info("Adding libraries")
        for library in libraries:
            t_dict = read_gtf_file(library, args.gtf_score_attr)
            logging.debug("Read %s transcripts from file %s" % (len(t_dict),
                                                                library.gtf_file))
            if len(t_dict) == 0:
                logging.warning("Library %s has no transcripts" %
                                (library.library_id))
            else:
                filter_transcripts(library.library_id, t_dict,
                                   tmpfileh, dropfileh, statsfileh,
                                   args.min_transcript_length)
    logging.info("Sorting GTF")
    retcode = sort_gtf(tmp_file, results.transcripts_gtf_file, tmp_dir=results.tmp_dir)
    if retcode != 0:
        logging.error("sort GTF failed")
        # do not leave a partially written result behind
        if os.path.exists(results.transcripts_gtf_file):
            os.remove(results.transcripts_gtf_file)
    os.remove(tmp_file)
    logging.info("Done")
    return retcode