Example #1
def buildAnnotDB(db, chr2gtf_lines, db_name, use_gene_name):

    for chr in chr2gtf_lines:
        # Dictionary to hold information before creating exon entries.
        # Dictionaries will be of the form {transcript_id: [gff_obj]}
        five_utr_dict = {}
        three_utr_dict = {}
        start_codon_dict = {}
        stop_codon_dict = {}
        cds_dict = {}
        exon_dict = {}

        # Holds transcript and strand information separately
        # {transcript_id: strand}
        transcript_id2strand = {}

        for line in chr2gtf_lines[chr]:
            gff_obj = GFF_Record(line)

            transcript_id = get_transcript_id(gff_obj) 
        
            # Update transcript_id2strand dictionary
            if transcript_id not in transcript_id2strand:
                transcript_id2strand[transcript_id] = gff_obj.strand

            # Update all feature dictionaries
#           if gff_obj.feature == '5UTR':
#               updateDictOfLists(five_utr_dict, transcript_id, gff_obj)
#           elif gff_obj.feature == '3UTR':     
#               updateDictOfLists(three_utr_dict, transcript_id, gff_obj)
#           elif gff_obj.feature == 'start_codon':     
#               updateDictOfLists(start_codon_dict, transcript_id, gff_obj)
#           elif gff_obj.feature == 'stop_codon':     
#               updateDictOfLists(stop_codon_dict, transcript_id, gff_obj)
#           elif gff_obj.feature == 'CDS':     
#               updateDictOfLists(cds_dict, transcript_id, gff_obj)
            if gff_obj.feature == 'exon':
                updateDictOfLists(exon_dict, transcript_id, gff_obj)
            else:
                print "Not using feature %s" % (gff_obj.feature)


        buildExonTable(db, chr, transcript_id2strand,
                       five_utr_dict,
                       three_utr_dict,
                       start_codon_dict,
                       stop_codon_dict,
                       cds_dict,
                       exon_dict, db_name, use_gene_name)

#       insertFeature(db, "cds", cds_dict, db_name)
#       insertFeature(db, "start_codon", start_codon_dict, db_name)
#       insertFeature(db, "stop_codon", stop_codon_dict, db_name)
#       insertFeature(db, "five_utr", five_utr_dict, db_name)
#       insertFeature(db, "three_utr", three_utr_dict, db_name)

        buildGeneTable(db, db_name, chr)       

        inferIntrons(db, db_name, chr)
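Examples #1 and #2 lean on several helpers that are not shown on this page: GFF_Record, get_transcript_id, and updateDictOfLists. The following is a minimal sketch of plausible implementations, inferred only from how they are called above (the GFF_Record field layout assumes standard 9-column tab-separated GTF lines; the attribute format assumed by get_transcript_id is likewise an assumption):

import re

class GFF_Record:
    """Minimal holder for one GTF line; fields inferred from usage above."""
    def __init__(self, line):
        fields = line.split("\t")
        self.chr = fields[0]
        self.source = fields[1]
        self.feature = fields[2]      # e.g., 'exon', 'CDS', '5UTR'
        self.start = int(fields[3])
        self.end = int(fields[4])
        self.score = fields[5]
        self.strand = fields[6]       # '+' or '-'
        self.frame = fields[7]
        self.attributes = fields[8]   # e.g., 'gene_id "g1"; transcript_id "t1";'

def get_transcript_id(gff_obj):
    """Extracts the transcript_id attribute (assumed attribute format)."""
    match = re.search(r'transcript_id "([^"]+)"', gff_obj.attributes)
    return match.group(1) if match else None

def updateDictOfLists(d, key, item):
    """Appends item to the list at d[key], creating the list if needed."""
    if key in d:
        d[key].append(item)
    else:
        d[key] = [item]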
Example #2
def buildAnnotDB(db, chr2gtf_lines, db_name, use_gene_name):

    for chr in chr2gtf_lines:
        # Dictionary to hold information before creating exon entries.
        # Dictionaries will be of the form {transcript_id: [gff_obj]}
        five_utr_dict = {}
        three_utr_dict = {}
        start_codon_dict = {}
        stop_codon_dict = {}
        cds_dict = {}
        exon_dict = {}

        # Holds transcript and strand information separately
        # {transcript_id: strand}
        transcript_id2strand = {}

        for line in chr2gtf_lines[chr]:
            gff_obj = GFF_Record(line)

            if gff_obj.feature != 'exon':
                continue

            transcript_id = get_transcript_id(gff_obj)

            # Update transcript_id2strand dictionary
            if transcript_id not in transcript_id2strand:
                transcript_id2strand[transcript_id] = gff_obj.strand

            # Update all feature dictionaries


#           if gff_obj.feature == '5UTR':
#               updateDictOfLists(five_utr_dict, transcript_id, gff_obj)
#           elif gff_obj.feature == '3UTR':
#               updateDictOfLists(three_utr_dict, transcript_id, gff_obj)
#           elif gff_obj.feature == 'start_codon':
#               updateDictOfLists(start_codon_dict, transcript_id, gff_obj)
#           elif gff_obj.feature == 'stop_codon':
#               updateDictOfLists(stop_codon_dict, transcript_id, gff_obj)
#           elif gff_obj.feature == 'CDS':
#               updateDictOfLists(cds_dict, transcript_id, gff_obj)
            if gff_obj.feature == 'exon':
                updateDictOfLists(exon_dict, transcript_id, gff_obj)
            else:
                print "Not using feature %s" % (gff_obj.feature)

        buildExonTable(db, chr, transcript_id2strand, five_utr_dict,
                       three_utr_dict, start_codon_dict, stop_codon_dict,
                       cds_dict, exon_dict, db_name, use_gene_name)

        #       insertFeature(db, "cds", cds_dict, db_name)
        #       insertFeature(db, "start_codon", start_codon_dict, db_name)
        #       insertFeature(db, "stop_codon", stop_codon_dict, db_name)
        #       insertFeature(db, "five_utr", five_utr_dict, db_name)
        #       insertFeature(db, "three_utr", three_utr_dict, db_name)

        buildGeneTable(db, db_name, chr)

        inferIntrons(db, db_name, chr)
Example #3
def make_paired_end_ie_junctions2qname(all_introns, input_dir, read_lengths,
                                       overhang, samp, paired_read_set):

    ie2qname_file_name = input_dir + samp + "/" + samp + "_paired_end_ie_junctions2qname.txt"
    ie2qname_file = open(ie2qname_file_name, "w")

    ie2qnames = {}

    for read_len in read_lengths:
        # Region coords that will be looked up in the region/read
        # association file
        region_coord2ie = getTheseRegionCoords(all_introns, read_len, overhang)

        confident_ie_file_name = input_dir + samp + "/tmp_" + samp + "_" + repr(
            read_len) + "_confident_ie.txt"
        confident_ie_file = open(confident_ie_file_name)

        confident_ie_set = set([])
        for line in confident_ie_file:
            confident_ie_set.add(formatLine(line))
        confident_ie_file.close()

        coords_w_reads_file_name = input_dir + samp + "/tmp_" + samp + "_" + repr(
            read_len) + "_intron_exon_junction_coords_w_read.out"
        coords_w_reads_file = open(coords_w_reads_file_name)

        for line in coords_w_reads_file:
            line = formatLine(line)
            line_list = line.split("\t")

            if line_list[-1] in region_coord2ie:
                ie = region_coord2ie[line_list[-1]]
                if ie in confident_ie_set:
                    updateDictOfLists(ie2qnames, ie, line_list[0])

        coords_w_reads_file.close()

    for ie in ie2qnames:
        outline = "%s\t%s\n" % (ie, ",".join(ie2qnames[ie]))
        ie2qname_file.write(outline)

    ie2qname_file.close()
Example #4
def make_paired_end_ie_junctions2qname(all_introns, input_dir, read_lengths, overhang, samp, paired_read_set):

    ie2qname_file_name = input_dir + samp + "/" + samp + "_paired_end_ie_junctions2qname.txt"
    ie2qname_file = open(ie2qname_file_name, "w")

    ie2qnames = {}

    for read_len in read_lengths:
        # Region coords that will be looked up in the region/read
        # association file
        region_coord2ie = getTheseRegionCoords(all_introns, read_len, overhang)

        confident_ie_file_name = input_dir + samp + "/tmp_" + samp + "_" + repr(read_len) + "_confident_ie.txt"
        confident_ie_file = open(confident_ie_file_name)

        confident_ie_set = set([])
        for line in confident_ie_file:
            confident_ie_set.add(formatLine(line))
        confident_ie_file.close()

        coords_w_reads_file_name = input_dir + samp + "/tmp_" + samp + "_" + repr(read_len) + "_intron_exon_junction_coords_w_read.out"
        coords_w_reads_file = open(coords_w_reads_file_name)
        
        for line in coords_w_reads_file:
            line = formatLine(line)
            line_list = line.split("\t")

            if line_list[-1] in region_coord2ie:
                ie = region_coord2ie[line_list[-1]]
                if ie in confident_ie_set:
                    updateDictOfLists(ie2qnames, ie, line_list[0])                

        coords_w_reads_file.close()           
   
    for ie in ie2qnames:
        outline = "%s\t%s\n" % (ie,
                              ",".join(ie2qnames[ie]))     
        ie2qname_file.write(outline)

    ie2qname_file.close() 
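formatLine appears throughout these snippets to clean each raw line before it is split on tabs. A one-line sketch, assuming it simply strips the trailing newline and carriage return:

def formatLine(line):
    """Removes the trailing newline/carriage return from a line."""
    return line.rstrip("\r\n")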
Example #5
def getGTFLines(gtf_file):
    """
    Returns a dictionary mapping each chromosome to its GTF lines
    {chr: [line, ...]}
    """
    file = open(gtf_file)

    chr2gtf_lines = {}

    for line in file:
        # Skip comment lines
        if line.startswith("#"):
            continue
    
        line = formatLine(line)

        chr = line.split("\t")[0] 

        updateDictOfLists(chr2gtf_lines, chr, line)

    file.close()

    return chr2gtf_lines
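For context, the chr2gtf_lines dictionary returned here is the second argument of buildAnnotDB in Examples #1 and #2. A hypothetical call sequence (the file name, database handle, and database name are placeholders, not values from the project):

chr2gtf_lines = getGTFLines("annotation.gtf")   # hypothetical input file
db = DB(sqlite_db_dir)                          # DB handle as in the later examples
buildAnnotDB(db, chr2gtf_lines, "my_annot_db", use_gene_name=False)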
Example #6
def getGTFLines(gtf_file):
    """
    Returns a dictionary mapping each chromosome to its GTF lines
    {chr: [line, ...]}
    """
    file = open(gtf_file)

    chr2gtf_lines = {}

    for line in file:
        # Skip comment lines
        if line.startswith("#"):
            continue

        line = formatLine(line)

        chr = line.split("\t")[0]

        updateDictOfLists(chr2gtf_lines, chr, line)

    file.close()

    return chr2gtf_lines
Example #7
def main():

    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option("-i",
                          dest="intron_coords",
                          type="string",
                          help="""File of intron coordinates.  Format:
                                  type, chr, strand, start, end""",
                          default=None)
    opt_parser.add_option("-b",
                          dest="bed_intron_coords",
                          type="string",
                          help="BED file of intron coordinates.",
                          default=None)
    opt_parser.add_option("-a",
                          dest="read_alignments",
                          type="string",
                          help="""File of alignments to genome. 
                                  Format:
                                  chr, start, strand""",
                          default=None)
    opt_parser.add_option("-f",
                          dest="flanking_dist",
                          type="int",
                          help="""Distance away from exon intron junction to
                                  check for reads in.""",
                          default=None)
    opt_parser.add_option("-o",
                          dest="offsets",
                          type="int",
                          help="""Minimum number of offsets required at each
                                  exon/intron junction. Default=1""",
                          default=1)
    opt_parser.add_option("-l",
                          dest="read_length",
                          type="int",
                          help="Length of the reads.",
                          default=1)
    opt_parser.add_option("--out_dir",
                          dest="out_dir",
                          type="string",
                          help="Output files are put here.",
                          default=None)
    opt_parser.add_option("--out_prefix",
                          dest="prefix",
                          type="string",
                          help="Prefix attached to all output files.",
                          default=None)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("-a")
    opt_parser.check_required("-f")
    opt_parser.check_required("-l")
    opt_parser.check_required("--out_dir")
    opt_parser.check_required("--out_prefix")

    # Check that the COUNTING_SCRIPT path is valid
    if not os.path.exists(COUNTING_SCRIPT):
        print("Please change COUNTING_SCRIPT path.")
        opt_parser.print_help()
        sys.exit(1)

    if options.intron_coords and options.bed_intron_coords:
        print("Only one type of intron coord can be used as input.")
        opt_parser.print_help()
        sys.exit(1)

    if (not options.intron_coords) and (not options.bed_intron_coords):
        print(" Need to specify intron coordinates. See options -i or -b")
        opt_parser.print_help()
        sys.exit(1)

    intron_coords = None
    isBedFormat = False
    if options.intron_coords:
        intron_coords = open(options.intron_coords)
    if options.bed_intron_coords:
        intron_coords = open(options.bed_intron_coords)
        isBedFormat = True

    read_alignments = options.read_alignments

    read_length = options.read_length

    flanking_dist = options.flanking_dist
    offsets = options.offsets

    prefix = options.prefix
    out_dir = options.out_dir

    if not out_dir.endswith("/"):
        out_dir += "/"

    if not os.path.exists(out_dir):
        print("Output directory does not exist")
        sys.exit(1)

    # Intermediate Output Files
    out_coords_file = out_dir + prefix + "_intron_exon_junction_coords.out"
    out_coords = open(out_coords_file, "w")

    out_read_assoc_file = out_dir + prefix + "_intron_exon_junction_coords_w_read.out"

    # Final output
    out_file_name = out_dir + prefix + "_intron_exon_junction_counts.txt"
    out_file = open(out_file_name, "w")

    confident_ie_name = out_dir + prefix + "_confident_ie.txt"
    confident_ie_file = open(confident_ie_name, "w")

    # Conceptually: {intron_coord: {"left": (chr, start, end),
    #                               "right": (chr, start, end)}}
    # where "left" and "right" are the flanking regions around the
    # exon/intron junction on each side of the intron.
    # The two dicts below store the reverse mapping:
    # {region_coord: [intron_coord_str]}
    left_region_coord2intron = {}
    right_region_coord2intron = {}

    # {intron_coord_str:{"left":{pos:count},
    #                    "right":{pos:count}}
    intron_dict = {}

    regions_set = set([])

    for line in intron_coords:
        line = formatLine(line)

        if isBedFormat:
            if line.startswith("track"):
                continue
            chr, start_str, end_str = parseBEDLine(line)
        else:
            type, chr, strand, start_str, end_str = line.split("\t")

        if chr.startswith("chr"):
            chr = chr.replace("chr", "")

        intron_coord_str = "%s:%s-%s" % (chr, start_str, end_str)

        if intron_coord_str not in intron_dict:
            intron_dict[intron_coord_str] = {"left": {}, "right": {}}

        start = int(start_str)
        end = int(end_str)

        left_coord = (chr, start - flanking_dist, start + flanking_dist - 1)

        right_coord = (chr, end - flanking_dist + 1, end + flanking_dist)

        updateDictOfLists(left_region_coord2intron, left_coord,
                          intron_coord_str)
        updateDictOfLists(right_region_coord2intron, right_coord,
                          intron_coord_str)

        regions_set.add(left_coord)
        regions_set.add(right_coord)

    # Print out regions out_coords
    for region_coord in regions_set:

        out_line = "%s\t%d\t%d\n" % (region_coord[0], region_coord[1],
                                     region_coord[2])

        out_coords.write(out_line)

    out_coords.close()

    # Used to make unique name for tmp file in case a shared directory is being
    # used for runs.
    rand_num = random.randrange(1, 100000)

    # Get Read Counts
    print("Getting Counts in Region")
    cmd = "python %s --reads %s -l %d --coords %s -o %stmp%d.txt --read_assoc %s" % (
        COUNTING_SCRIPT, read_alignments, read_length, out_coords_file,
        out_dir, rand_num, out_read_assoc_file)
    print(cmd)
    #    runCmd(cmd, SHELL)
    os.system(cmd)

    # Remove the tmp file
    #    runCmd("rm %stmp%d.txt" % (out_dir, rand_num), SHELL)
    os.system("rm %stmp%d.txt" % (out_dir, rand_num))

    print("Getting Left and Right Counts")
    # Parse read_assoc_file to get information
    read_assoc_file = open(out_read_assoc_file)

    for line in read_assoc_file:
        line = formatLine(line)

        line_list = line.split("\t")

        read_start, read_end = getReadStartEnd(line_list[1])

        region_coord = getRegionCoord(line_list[2])
        intron_coord_list = getIntronStartEnds(left_region_coord2intron,
                                               right_region_coord2intron,
                                               region_coord)

        if region_coord in left_region_coord2intron:
            for intron_str in left_region_coord2intron[region_coord]:
                # Put in left dictionaries
                if read_end not in intron_dict[intron_str]["left"]:
                    intron_dict[intron_str]["left"][read_end] = 1
                else:
                    intron_dict[intron_str]["left"][read_end] += 1

        if region_coord in right_region_coord2intron:
            for intron_str in right_region_coord2intron[region_coord]:
                # Check right dictionary
                if read_end not in intron_dict[intron_str]["right"]:
                    intron_dict[intron_str]["right"][read_end] = 1
                else:
                    intron_dict[intron_str]["right"][read_end] += 1

    # Print output
    confident_ie_set = set([])
    for intron_str in intron_dict:
        #       chr, intron_start_str, intron_end_str = intron_str.split("_")
        #       intron_start = int(intron_start_str)
        #       intron_end = int(intron_end_str)
        chr, intron_start, intron_end = convertCoordStr(intron_str)

        # Get left_counts
        if len(intron_dict[intron_str]["left"]) >= offsets:
            left_count = getTotalCounts(intron_dict[intron_str]["left"])
            confident_ie = "%s:%d-%d" % (chr, intron_start - 1, intron_start)
            confident_ie_set.add(confident_ie)
        else:
            left_count = 0

        # Get right counts
        if len(intron_dict[intron_str]["right"]) >= offsets:
            right_count = getTotalCounts(intron_dict[intron_str]["right"])
            confident_ie = "%s:%d-%d" % (chr, intron_end, intron_end + 1)
            confident_ie_set.add(confident_ie)
        else:
            right_count = 0

        if left_count == 0 and right_count == 0:
            continue

        print_line = "%s\t%d\t%d\n" % (intron_str, left_count, right_count)

        out_file.write(print_line)

    # Now print out confident set of ie
    for ie in confident_ie_set:
        confident_ie_file.write("%s\n" % ie)

    confident_ie_file.close()

    sys.exit(0)
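Example #7 encodes intron coordinates as "chr:start-end" strings and decodes them with convertCoordStr, which is not shown. A minimal sketch consistent with that format (an inferred implementation, not necessarily the project's own):

def convertCoordStr(coord_str):
    """Splits a 'chr:start-end' string into (chr, int(start), int(end))."""
    chrom, start_end = coord_str.split(":")
    start_str, end_str = start_end.split("-")
    return chrom, int(start_str), int(end_str)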
Example #8
def main():

    opt_parser = OptionParser()
   
    # Add Options. Required options should have default=None
    opt_parser.add_option("--initialize",
                          dest="initialize",
                          action="store_true",
                          help="""Will split up the gtf file into separate temp files
                                  and initalize the database.""",
                          default=False)
    opt_parser.add_option("--tmp_dir",
                          dest="tmp_dir",
                          type="string",
                          help="""Directory to place temporary files and to look
                                  for temporary files.""",
                          default=None)
    opt_parser.add_option("--keep_temp",
                          dest="keep_temp",
                          action="store_true",
                          help="""TEMP FILES ARE KEPT FOR NOW. Will keep the temporary gtf files. Default is
                                  to delete them.""",
                          default=False)
    opt_parser.add_option("-g",
                          dest="gtf_file",
                          type="string",
                          help="GTF annotation file.",
                          default=None)
    opt_parser.add_option("--use_gene_name",
                          dest="use_gene_name",
                          action="store_true",
                          help="""By default, the gene_id attribute will be used
                                  for the gene name used in the database, but
                                  the gene_name attribute can be used
                                  instead.""",
                          default=False)
    # May revisit this option, but do not need now
#   opt_parser.add_option("-f",
#                         dest="genome_file_name",
#                         type="string",
#                         help="""Fasta file containing all chromosome
#                                 sequences.  If this option is given, exon and
#                                 intron sequences will be stored in the
#                                 database as well. Chromosome names must be the
#                                 same format as in the gtf file.""",
#                         default=None)
    opt_parser.add_option("-d",
                          dest="db_name",
                          type="string",
                          help="Name of the new database",
                          default=None)
    opt_parser.add_option("--sqlite_db_dir",
                          dest="sqlite_db_dir",
                          type="string",
                          help="Location to put sqlite database. Default=%s" % DB_DIR,
                          default=DB_DIR)
    opt_parser.add_option("-p",
                          dest="num_processes",
                          type="int",
                          help="""Will run getASEventReadCounts.py
                                  simultaneously with this many samples.
                                  Default=%d""" % DEF_NUM_PROCESSES,
                          default=DEF_NUM_PROCESSES)
    opt_parser.add_option("--LSF",
                          dest="run_lsf",
                          action="store_true",
                          help="""Will launch jobs on LSF. Default is running on
                                  local.""",
                          default=False)
    opt_parser.add_option("--force",
                          dest="force",
                          action="store_true",
                          help="""By default, will check for the existence of
                                  the final output before running commands. This
                                  option will force all runs.""",
                          default=False)
    opt_parser.add_option("--check",
                          dest="check",
                          action="store_true",
                          help="""Will check samples that are not done and print
                                  out which need to still be run""",
                         default=False)
    opt_parser.add_option("--print_cmd",
                          dest="print_cmd",
                          action="store_true",
                          help="""Will print commands that will be run, but will
                                  not run them. Used for debugging.""",
                         default=False)


    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("-g")
    opt_parser.check_required("--tmp_dir")
    opt_parser.check_required("-d")

    gtf_file_name = options.gtf_file
    tmp_dir = formatDir(options.tmp_dir)

    db_name = options.db_name

    sqlite_db_dir = options.sqlite_db_dir

    num_processes = options.num_processes
    run_lsf = options.run_lsf

    force = options.force
    check = options.check
    print_cmd = options.print_cmd

    ##############
    # INITIALIZE #
    ##############

    # If initializing, split the gtf file, initialize the database, and return
    if options.initialize:
        chr2lines = {}

        gtf_file_path = gtf_file_name
        gtf_file_name = gtf_file_name.split("/")[-1]
        gtf_file_comp = gtf_file_name.split(".")   
        gtf_file_prefix = ".".join(gtf_file_comp[:-1])
 
        gtf_file = open(gtf_file_path)

        for line in gtf_file:
            this_chr = line.split("\t")[0]
            updateDictOfLists(chr2lines, this_chr, line)
        gtf_file.close()

        for chr in chr2lines:
            tmp_chr_file = open("%s/%s_%s.gtf" % (tmp_dir,
                                                  gtf_file_prefix, chr),
                                "w")
            for line in chr2lines[chr]:
                tmp_chr_file.write(line)
            tmp_chr_file.close()

        # Now initialize the database
        cmd = "python %s " % SCRIPT
        cmd += "--initialize -d %s" % db_name
        os.system(cmd)
        
        sys.exit(0)

    ##################
    # BUILD DATABASE #
    ##################
    db = DB(sqlite_db_dir)

    # Use gtf file to figure out temp file names, Build the database from them
    tmp_file_list = []
    
    gtf_file_name = gtf_file_name.split("/")[-1]
    gtf_file_comp = gtf_file_name.split(".")   
    gtf_file_prefix = ".".join(gtf_file_comp[:-1])

    for this_file in os.listdir(tmp_dir):
        if gtf_file_prefix in this_file:
            if this_file == gtf_file_name:
                continue
            tmp_file_list.append(this_file)


    # Now run script for every chromosome file
    ctr = 0
    for tmp_file in tmp_file_list:

        this_chr = getChr(tmp_dir + "/" + tmp_file)

        if (not force) or check:

            # For now, this just checks that records exist in the database. It
            # is better to force, since it is difficult to really know whether
            # a chromosome was built or not.
            chr_built = checkChr(db, db_name, this_chr)
            
            if chr_built:
                if not force:
                    continue

            if check:
                if not chr_built:
                    print "Chromosome %s not built" % this_chr
                    continue

        ctr += 1

        cmd = "python %s " % SCRIPT
        cmd += "-g %s/%s " % (tmp_dir, tmp_file)
        cmd += "-d %s " % db_name

        if options.use_gene_name:
            cmd += "--use_gene_name "

        cmd += "--sqlite_db_dir %s" % sqlite_db_dir
        
        if print_cmd:
            print(cmd)
            continue


        if run_lsf:
            runLSF(cmd,
                   "%s.build_DB.bsub.out" % this_chr,
                   this_chr + "build_DB",
                   "hour")
            continue
        
        if ctr % num_processes == 0:
            os.system(cmd)
        else:
            print(cmd)
            Popen(cmd, shell=True, executable=SHELL)
    
    # Remove temp files, but first check that exons are returned from the same
    # chromosome in the database
#    if not options.keep_temp:

    sys.exit(0)
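Examples #8 and #9 dispatch per-chromosome jobs through runLSF, which is also not defined here. A plausible sketch, assuming it is a thin wrapper around LSF's bsub command (the project's actual helper may differ):

import os

def runLSF(cmd, bsub_out_file, job_name, queue):
    """Submits cmd to LSF via bsub with an output file, job name, and queue."""
    bsub_cmd = 'bsub -q %s -o %s -J %s "%s"' % (queue, bsub_out_file, job_name, cmd)
    os.system(bsub_cmd)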
Example #9
def main():

    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option(
        "--initialize",
        dest="initialize",
        action="store_true",
        help="""Will split up the gtf file into separate temp files
                                  and initialize the database.""",
        default=False)
    opt_parser.add_option(
        "--tmp_dir",
        dest="tmp_dir",
        type="string",
        help="""Directory to place temporary files and to look
                                  for temporary files.""",
        default=None)
    opt_parser.add_option(
        "--keep_temp",
        dest="keep_temp",
        action="store_true",
        help=
        """TEMP FILES ARE KEPT FOR NOW. Will keep the temporary gtf files. Default is
                                  to delete them.""",
        default=False)
    opt_parser.add_option("-g",
                          dest="gtf_file",
                          type="string",
                          help="GTF annotation file.",
                          default=None)
    opt_parser.add_option(
        "--use_gene_name",
        dest="use_gene_name",
        action="store_true",
        help="""By default, the gene_id attribute will be used
                                  for the gene name used in the database, but
                                  the gene_name attribute can be used
                                  instead.""",
        default=False)
    # May revisit this option, but do not need now
    #   opt_parser.add_option("-f",
    #                         dest="genome_file_name",
    #                         type="string",
    #                         help="""Fasta file containing all chromosome
    #                                 sequences.  If this option is given, exon and
    #                                 intron sequences will be stored in the
    #                                 database as well. Chromosome names must be the
    #                                 same format as in the gtf file.""",
    #                         default=None)
    opt_parser.add_option("-d",
                          dest="db_name",
                          type="string",
                          help="Name of the new database",
                          default=None)
    opt_parser.add_option("--sqlite_db_dir",
                          dest="sqlite_db_dir",
                          type="string",
                          help="Location to put sqlite database. Default=%s" %
                          DB_DIR,
                          default=DB_DIR)
    opt_parser.add_option("-p",
                          dest="num_processes",
                          type="int",
                          help="""Will run getASEventReadCounts.py
                                  simultaneously with this many samples.
                                  Default=%d""" % DEF_NUM_PROCESSES,
                          default=DEF_NUM_PROCESSES)
    opt_parser.add_option(
        "--LSF",
        dest="run_lsf",
        action="store_true",
        help="""Will launch jobs on LSF. Default is running on
                                  local.""",
        default=False)
    opt_parser.add_option("--force",
                          dest="force",
                          action="store_true",
                          help="""By default, will check for the existence of
                                  the final output before running commands. This
                                  option will force all runs.""",
                          default=False)
    opt_parser.add_option(
        "--check",
        dest="check",
        action="store_true",
        help="""Will check samples that are not done and print
                                  out which need to still be run""",
        default=False)
    opt_parser.add_option(
        "--print_cmd",
        dest="print_cmd",
        action="store_true",
        help="""Will print commands that will be run, but will
                                  not run them. Used for debugging.""",
        default=False)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("-g")
    opt_parser.check_required("--tmp_dir")
    opt_parser.check_required("-d")

    gtf_file_name = options.gtf_file
    tmp_dir = formatDir(options.tmp_dir)

    db_name = options.db_name

    sqlite_db_dir = options.sqlite_db_dir

    num_processes = options.num_processes
    run_lsf = options.run_lsf

    force = options.force
    check = options.check
    print_cmd = options.print_cmd

    ##############
    # INITIALIZE #
    ##############

    # If initializing, split the gtf file, initialize the database, and return
    if options.initialize:
        chr2lines = {}

        gtf_file_path = gtf_file_name
        gtf_file_name = gtf_file_name.split("/")[-1]
        gtf_file_comp = gtf_file_name.split(".")
        gtf_file_prefix = ".".join(gtf_file_comp[:-1])

        gtf_file = open(gtf_file_path)

        for line in gtf_file:
            this_chr = line.split("\t")[0]
            updateDictOfLists(chr2lines, this_chr, line)
        gtf_file.close()

        for chr in chr2lines:
            tmp_chr_file = open(
                "%s/%s_%s.gtf" % (tmp_dir, gtf_file_prefix, chr), "w")
            for line in chr2lines[chr]:
                tmp_chr_file.write(line)
            tmp_chr_file.close()

        # Now initialize the database
        cmd = "python %s " % SCRIPT
        cmd += "--initialize -d %s" % db_name
        os.system(cmd)

        sys.exit(0)

    ##################
    # BUILD DATABASE #
    ##################
    db = DB(sqlite_db_dir)

    # Use gtf file to figure out temp file names, Build the database from them
    tmp_file_list = []

    gtf_file_name = gtf_file_name.split("/")[-1]
    gtf_file_comp = gtf_file_name.split(".")
    gtf_file_prefix = ".".join(gtf_file_comp[:-1])

    for this_file in os.listdir(tmp_dir):
        if gtf_file_prefix in this_file:
            if this_file == gtf_file_name:
                continue
            tmp_file_list.append(this_file)

    # Now run script for every chromosome file
    ctr = 0
    for tmp_file in tmp_file_list:

        this_chr = getChr(tmp_dir + "/" + tmp_file)

        if (not force) or check:

            # For now, this just checks that records exist in the database. It
            # is better to force, since it is difficult to really know whether
            # a chromosome was built or not.
            chr_built = checkChr(db, db_name, this_chr)

            if chr_built:
                if not force:
                    continue

            if check:
                if not chr_built:
                    print "Chromosome %s not built" % this_chr
                    continue

        ctr += 1

        cmd = "python %s " % SCRIPT
        cmd += "-g %s/%s " % (tmp_dir, tmp_file)
        cmd += "-d %s " % db_name

        if options.use_gene_name:
            cmd += "--use_gene_name "

        cmd += "--sqlite_db_dir %s" % sqlite_db_dir

        if print_cmd:
            print(cmd)
            continue

        if run_lsf:
            runLSF(cmd, "%s.build_DB.bsub.out" % this_chr,
                   this_chr + "build_DB", "hour")
            continue

        if ctr % num_processes == 0:
            os.system(cmd)
        else:
            print(cmd)
            Popen(cmd, shell=True, executable=SHELL)

    # Remove temp files, but first check that exons are returned from the same
    # chromosome in the database
#    if not options.keep_temp:

    sys.exit(0)
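getChr, used above to recover which chromosome a temporary GTF file holds, is not shown either. Since the initialize step writes one chromosome per file, a minimal sketch need only read the first data line (an assumption about the helper, not its actual code):

def getChr(gtf_file_name):
    """Returns the chromosome named on the first non-comment line of a GTF file."""
    gtf_fh = open(gtf_file_name)
    for line in gtf_fh:
        if line.startswith("#"):
            continue
        gtf_fh.close()
        return line.split("\t")[0]
    gtf_fh.close()
    return None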
Example #10
def main():
    opt_parser = OptionParser()
    # Add Options. Required options should have default=None
    opt_parser.add_option("--in_prefix",
                          dest="in_prefix",
                          type="string",
                          help="""Prefix of output files created from
                                  createAS_CountTables. In createAS_CountTables,
                                  this is the -o option""",
                          default=None)
    #   opt_parser.add_option("-i",
    #                         dest="input_file",
    #                         type="string",
    #                         help="Resulting file from clusterASExons2.py",
    #                         default=None)
    #   opt_parser.add_option("--left_intron",
    #                         dest="left_input",
    #                         type="string",
    #                         help="""Resulting file from clusterASExons2.py, which
    #                                 contains the exclusion and inclusion counts
    #                                 for just the left side of an intron retention
    #                                 event.""",
    #                         default=None)
    #   opt_parser.add_option("--right_intron",
    #                         dest="right_input",
    #                         type="string",
    #                         help="""Resulting file from clusterASExons2.py, which
    #                                 contains the exclusion and inclusion counts
    #                                 for just the right side of an intron retention
    #                                 event.""",
    #                         default=None)
    #   opt_parser.add_option("--lenNormalized_counts",
    #                         dest="lenNormalized_counts",
    #                         type="string",
    #                         help="""File containing length-normalized inclusion
    #                                 exclusion counts. Used for PSI calculation,
    #                                 not for statistical significance.""",
    #                         default=None)
    #   opt_parser.add_option("--lenNormalized_left_intron",
    #                         dest="lenNormalized_left_intron_counts",
    #                         type="string",
    #                         help="""File containing length-normalized
    #                                 the left intron_retention counts.
    #                                 Used for PSI calculation, not for
    #                                 statistical significance.""",
    #                         default=None)
    #   opt_parser.add_option("--lenNormalized_right_intron",
    #                         dest="lenNormalized_right_intron_counts",
    #                         type="string",
    #                         help="""File containing length-normalized
    #                                 the right intron_retention counts.
    #                                 Used for PSI calculation, not for
    #                                 statistical significance.""",
    #                         default=None)
    opt_parser.add_option("--has_virtual",
                          dest="has_virtual",
                          action="store_true",
                          help="""Gives flags that a virtual reference is being
                                  used.""",
                          default=False)
    opt_parser.add_option("--jcn_seq_len",
                          dest="jcn_seq_len",
                          type="int",
                          help="""Junction length. Used as an option in
                                  getASEventReadCounts.py""",
                          default=None)
    opt_parser.add_option("--output_dir",
                          dest="output_dir",
                          type="string",
                          help="Directory to place output files.",
                          default=None)
    opt_parser.add_option("--out_prefix",
                          dest="prefix",
                          type="string",
                          help="Prefix of all output files. DEF=None",
                          default=None)
    #   opt_parser.add_option("--psi_output_most_sign",
    #                         dest="psi_output",
    #                         type="string",
    #                         help="""Output file that will contain the PSI values
    #                                 for all events and samples that are
    #                                 significantly spliced.""",
    #                         default=None)
    #   opt_parser.add_option("--psi_output_sign_by_samp",
    #                         dest="psi_output_by_samp",
    #                         type="string",
    #                         help="""Output file that will contain the PSI values
    #                                 for all events and samples that are
    #                                 significantly differentially spliced where
    #                                 multiple testing is not done for all samples
    #                                 tested against the virtual reference""",
    #                         default=None)
    #   opt_parser.add_option("--all_psi_output",
    #                         dest="all_psi_output",
    #                         type="string",
    #                         help="""Output file that will contain the PSI values
    #                                 for all events and samples that pass minimum
    #                                 count thresholds""",
    #                         default=None)
    #   opt_parser.add_option("--left_intron_all_psi_output",
    #                         dest="left_intron_all_psi_output",
    #                         type="string",
    #                         help="""Output file that will contain the PSI values
    #                                 for the left side of intron retention
    #                                 samples. Not required, but used for dPSI
    #                                 thresholds when taking all splice events.""",
    #                         default=None)
    #   opt_parser.add_option("--right_intron_all_psi_output",
    #                         dest="right_intron_all_psi_output",
    #                         type="string",
    #                         help="""Output file that will contain the PSI values
    #                                 for the right side of intron retention
    #                                 samples. Not required, but used for dPSI
    #                                 thresholds when taking all splice events.""",
    #                         default=None)
    #   opt_parser.add_option("--recalculate_ref_psi",
    #                         dest="recalculate_ref_psi",
    #                         action="store_true",
    #                         help="""The reference PSI given in input tables
    #                                 should be recalculated due to changes in
    #                                 thresholding for minimum input between
    #                                 length-normalized and raw counts.""",
    #                         default=False)
    #   opt_parser.add_option("--pval_output",
    #                         dest="pval_output",
    #                         type="string",
    #                         help="""Output file that will associate the
    #                                 unadjusted and adjusted p-values for all
    #                                 pairs that were tested.""",
    #                         default=None)
    #   opt_parser.add_option("--event_sum",
    #                         dest="event_sum",
    #                         type="string",
    #                         help="""Output file that will contain the sum of the
    #                                 exclusion and inclusion counts for every
    #                                 sample that was considered significantly
    #                                 affected.""",
    #                         default=None)
    opt_parser.add_option("--thresh",
                          dest="threshold",
                          type="int",
                          help="""Threshold for minimum number of total reads
                                  in an event. Default=%d""" % DEF_THRESH,
                          default=DEF_THRESH)
    opt_parser.add_option("--min_dpsi_threshold",
                          dest="dpsi_threshold",
                          type="float",
                          help="""Threshold for minimum delta PSI value between
                                  the sample with the smallest and largest PSI.
                                  Events with dPSI values below the threshold
                                  will not be tested or reported. Def=%.2f""" %
                          DEF_DPSI_THRESH,
                          default=DEF_DPSI_THRESH)
    opt_parser.add_option(
        "--method",
        dest="method",
        type="string",
        help="""Correction Method: "BH" - Benjamini & Hochberg,
                                  "bonferroni".  Must select these strings as
                                  the option""",
        default=None)
    opt_parser.add_option("--sign_cutoff",
                          dest="sign_cutoff",
                          type="float",
                          help="""Cutoff of corrected p-value significance.
                                  Default=%.2f""" % DEF_SIGN_CUTOFF,
                          default=DEF_SIGN_CUTOFF)
    opt_parser.add_option("--weights",
                          dest="weights",
                          type="string",
                          help="""Comma separated list of weights given in the
                                  order of the samples in the table. Weights are
                                  used to create a weighted median. Default is
                                  equal weight for all samples.""",
                          default=None)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    #    opt_parser.check_required("-i")
    #    opt_parser.check_required("--psi_output_most_sign")
    #    opt_parser.check_required("--pval_output")
    #    opt_parser.check_required("--event_sum")
    opt_parser.check_required("--method")
    opt_parser.check_required("--in_prefix")
    opt_parser.check_required("--out_prefix")
    opt_parser.check_required("--jcn_seq_len")

    in_prefix = options.in_prefix
    prefix = options.prefix

    try:
        input_file = open(in_prefix + "_AS_exclusion_inclusion_counts.txt")
    except:
        print(
            ("""Cannot find expected file %s_AS_exclusion_inclusion_counts.txt.
                 Please check that the same option was given to
                 combine_createAS_CountTables""" % in_prefix))
        opt_parser.print_help()
        sys.exit(1)

    left_input_file_name = in_prefix + "_left_intron_counts.txt"
    right_input_file_name = in_prefix + "_right_intron_counts.txt"
    sum_thresh = options.threshold

    sign_cutoff = options.sign_cutoff

    dpsi_thresh = options.dpsi_threshold

    left_input_file = None
    right_input_file = None
    if not os.path.exists(left_input_file_name):
        print(
            "Warning: No intron retention file given as input. Will not calculate IR events."
        )
    else:
        left_input_file = open(left_input_file_name)
        right_input_file = open(right_input_file_name)

    output_dir = formatDir(options.output_dir)

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    out_prefix = "%s/%s" % (output_dir, options.prefix)

    psi_out = open("%s_most_sign_PSI.txt" % out_prefix, "w")
    pval_out = open("%s_pairs_p_val.txt" % out_prefix, "w")

    has_virtual = options.has_virtual

    # Optional output files
    psi_out_by_samp = open("%s_sign_by_samp_PSI.txt" % out_prefix, "w")

    all_psi_output = open("%s_allPSI.txt" % out_prefix, "w")

    left_all_psi_output = open(
        "%s_left_intron_retention_allPSI.txt" % out_prefix, "w")

    right_all_psi_output = open(
        "%s_right_intron_retention_allPSI.txt" % out_prefix, "w")

    jcn_seq_len = options.jcn_seq_len

    recalculate_ref_psi = False
    lenNormalized_counts_event2PSIs = None
    lenNormalized_counts_event2total_counts = None
    #   if options.lenNormalized_counts:
    #       if ((not options.lenNormalized_left_intron_counts) or
    #           (not options.lenNormalized_right_intron_counts)):
    #           print "Need to specify all length-normalized count files."
    #           opt_parser.print_help()
    #           sys.exit(1)
    #
    recalculate_ref_psi = True
    lenNormalized_counts = open(in_prefix +
                                "_AS_exclusion_inclusion_counts_lenNorm.txt")
    (lenNormalized_counts_event2total_counts,
     lenNormalized_counts_event2PSIs) = buildDicts(lenNormalized_counts)
    lenNormalized_counts.close()

    left_lenNormalized_counts_event2total_counts = None
    left_lenNormalized_counts_event2PSIs = None
    #   if options.lenNormalized_left_intron_counts:
    #       if ((not options.lenNormalized_counts) or
    #           (not options.lenNormalized_right_intron_counts)):
    #           print "Need to specify all length-normalized count files."
    #           opt_parser.print_help()
    #           sys.exit(1)

    left_lenNormalized_counts = open(in_prefix +
                                     "_left_intron_counts_lenNorm.txt")
    (left_lenNormalized_counts_event2total_counts,
     left_lenNormalized_counts_event2PSIs
     ) = buildDicts(left_lenNormalized_counts)
    left_lenNormalized_counts.close()

    right_lenNormalized_counts_event2total_counts = None
    right_lenNormalized_counts_event2PSIs = None
    #   if options.lenNormalized_right_intron_counts:
    #       if ((not options.lenNormalized_counts) or
    #           (not options.lenNormalized_left_intron_counts)):
    #           print "Need to specify all length-normalized count files."
    #           opt_parser.print_help()
    #           sys.exit(1)

    right_lenNormalized_counts = open(in_prefix +
                                      "_right_intron_counts_lenNorm.txt")
    (right_lenNormalized_counts_event2total_counts,
     right_lenNormalized_counts_event2PSIs
     ) = buildDicts(right_lenNormalized_counts)
    right_lenNormalized_counts.close()

    #    if options.lenNormalized_counts:
    #       if not jcn_seq_len:
    #           print "If length normalized counts are specified, need to give jcn_seq_len"
    #           opt_parser.print_help()
    #           sys.exit(1)

    weights = None
    if options.weights:
        weights = list(map(float, options.weights.split(",")))

        # Use R limma package
        try:
            r.library("limma")
        except:
            print(
                """In order to use weighted median, please install the limma package from Bioconductor: 
                     http://www.bioconductor.org/packages/release/bioc/html/limma.html"""
            )
            print(
                """In R:\nsource("http://bioconductor.org/biocLite.R")\nbiocLite("limma")"""
            )

    event_sum = open("%s_event_sum.txt" % out_prefix, "w")

    if options.method != "BH" and options.method != "bonferroni":
        print("Wrong method indicated.")
        opt_parser.print_help()
        sys.exit(1)

    method_map = {"BH": "fdr_bh", "bonferroni": "bonferroni"}
    method = method_map[options.method]

    # {event_type:[pval]}
    event_type2pvals = {}

    # {event:(col1, col2):pval_idx}
    event2pairs2idx = {}

    # Additional pval holders tested by each sample against the reference
    # {event_type:col:[pval]}
    event_type2col2pvals = {}

    # {event:col:pval_idx}
    event2col2idx = {}

    # {event:{col:psi}}
    event2col2psi = {}

    # {event:{col:sum_counts}}
    event2col2sum = {}

    # For weighted median
    col2weights = None

    header = None
    total_samples = None
    for line in input_file:
        line = formatLine(line)

        if line.startswith("#"):
            header = line
            line_list = line.split("\t")
            samples = line_list[11:]
            total_samples = len(samples)
            if weights:
                if len(weights) != total_samples - 1:
                    print("Weights for every sample needs to be given")
                    opt_parser.print_help()
                    sys.exit(1)

                col2weights = {}
                for i in range(1, total_samples):
                    col2weights[i - 1] = weights[i - 1]
            continue

        line_list = line.split("\t")

        event = "\t".join(line_list[0:11])
        counts = line_list[11:]

        # If the reference is NA, then do not calculate anything
        if counts[0] == NA:
            continue

        if has_virtual:
            # Cannot do a comparison when virtual reference is low expressed
            if lenNormalized_counts_event2total_counts[event][0] == NA:
                continue

        lenNormalized_psis = [None for i in range(len(counts))]
        if lenNormalized_counts_event2PSIs:
            try:
                lenNormalized_psis = lenNormalized_counts_event2PSIs[event]
            except:
                print(("Warning: Can't find event in lenNormalized psis: %s" %
                       event))
                continue

        event_type = getEventType(event)
        if event_type not in event_type2pvals:
            event_type2pvals[event_type] = []
        if event_type not in event_type2col2pvals:
            event_type2col2pvals[event_type] = {}

        # Fill PSI dict
        for i in range(total_samples):
            (psi, sum_ct) = getPSI_sample_sum(counts[i], sum_thresh,
                                              lenNormalized_psis[i])
            if event in event2col2psi:
                event2col2psi[event][i] = psi
                event2col2sum[event][i] = sum_ct
            else:
                event2col2psi[event] = {i: psi}
                event2col2sum[event] = {i: sum_ct}

        # Only PSIs that passed sum_thresh will be present in event2col2psi,
        # so the reference PSI will be calculated from the median of the
        # existing values
        if recalculate_ref_psi and has_virtual:
            adj_psi, adj_totalCount = recalculateRefPSI(
                event2col2psi[event],
                lenNormalized_counts_event2total_counts[event], col2weights)
            event2col2psi[event][0] = adj_psi
            lenNormalized_counts_event2total_counts[event][0] = adj_totalCount

        if dPSI(event2col2psi[event]) < dpsi_thresh:
            for j in range(1, total_samples):
                if event in event2pairs2idx:
                    event2pairs2idx[event][(0, j)] = NA
                else:
                    event2pairs2idx[event] = {(0, j): NA}
                if event in event2col2idx:
                    event2col2idx[event][j] = NA
                else:
                    event2col2idx[event] = {j: NA}

            continue

        # Calculate p-val for intron retention later
        if event_type == "intron_retention":
            continue

        # Do pairwise comparisons with first column
        [col1_excl, col1_incl] = list(map(int, counts[0].split(";")))
        if recalculate_ref_psi and has_virtual:
            # Need to also adjust relative counts based on new PSI
            col1_excl, col1_incl = adjustRefCounts(
                event, jcn_seq_len,
                lenNormalized_counts_event2total_counts[event][0],
                float(event2col2psi[event][0]), col1_excl, col1_incl)

        for j in range(1, total_samples):

            if j not in event_type2col2pvals[event_type]:
                event_type2col2pvals[event_type][j] = []

            [col2_excl, col2_incl] = list(map(int, counts[j].split(";")))

            # Both samples have to be non-zero
            if belowThreshold(sum_thresh, col1_excl, col1_incl, col2_excl,
                              col2_incl):
                if event in event2pairs2idx:
                    event2pairs2idx[event][(0, j)] = NA
                else:
                    event2pairs2idx[event] = {(0, j): NA}
                if event in event2col2idx:
                    event2col2idx[event][j] = NA
                else:
                    event2col2idx[event] = {j: NA}

                continue

            cur_len = len(event_type2pvals[event_type])
            cur_len2 = len(event_type2col2pvals[event_type][j])

            if event in event2pairs2idx:
                event2pairs2idx[event][(0, j)] = cur_len
            else:
                event2pairs2idx[event] = {(0, j): cur_len}

            if event in event2col2idx:
                event2col2idx[event][j] = cur_len2
            else:
                event2col2idx[event] = {j: cur_len2}

            _, raw_pval = scipy.stats.fisher_exact([[col1_excl, col1_incl],
                                                    [col2_excl, col2_incl]])

            event_type2pvals[event_type].append(raw_pval)

            updateDictOfLists(event_type2col2pvals[event_type], j, raw_pval)

    # Now calculate intron retention
    if left_input_file:
        left_events2counts = getIntronLeftRightCounts(left_input_file)
        right_events2counts = getIntronLeftRightCounts(right_input_file)
    else:
        left_events2counts = {}
        right_events2counts = {}

    if left_all_psi_output:
        left_all_psi_output.write(header + "\n")
    if right_all_psi_output:
        right_all_psi_output.write(header + "\n")

    for event in left_events2counts:
        if event not in right_events2counts:
            continue

        allPSI_elems_left = []
        allPSI_elems_right = []

        left_length = len(left_events2counts[event])
        right_length = len(right_events2counts[event])

        lenNormalized_left_psis = [None for i in range(left_length)]
        lenNormalized_right_psis = [None for i in range(right_length)]

        if left_lenNormalized_counts_event2PSIs:
            try:
                lenNormalized_left_psis = left_lenNormalized_counts_event2PSIs[
                    event]
            except:
                print((
                    "Warning: Could not find event in left_lenNormalized psis: %s"
                    % event))
                continue
        if right_lenNormalized_counts_event2PSIs:
            try:
                lenNormalized_right_psis = right_lenNormalized_counts_event2PSIs[
                    event]
            except:
                print((
                    "Warning: Could not find event in right_lenNormalized psis: %s"
                    % event))
                continue

        # Fill PSI dict
        for i in range(left_length):
            (psi, sum_ct) = getPSI_sample_sum(left_events2counts[event][i],
                                              sum_thresh,
                                              lenNormalized_left_psis[i])
            allPSI_elems_left.append(psi)

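            # NOTE: the bare except/pdb.set_trace() below is a leftover debug
            # hook that drops into the debugger when the right-side counts for
            # this event are missing or malformed.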
            try:
                (psi,
                 sum_ct) = getPSI_sample_sum(right_events2counts[event][i],
                                             sum_thresh,
                                             lenNormalized_right_psis[i])
            except:
                pdb.set_trace()
            allPSI_elems_right.append(psi)

#           # Adding left and right PSI values
#           if left_col2_excl + left_col2_incl < sum_thresh:
#               allPSI_elems_left.append(NA)
#           else:
#               allPSI_elems_left.append(getPSI(left_col2_excl, left_col2_incl,
#                                               lenNormalized_left_psis[j]))

#           if right_col2_excl + right_col2_incl < sum_thresh:
#               allPSI_elems_right.append(NA)
#           else:
#               allPSI_elems_right.append(getPSI(right_col2_excl,
#                                                right_col2_incl,
#                                                lenNormalized_right_psis[j]))

        # Only PSIs that passed sum_thresh are present; the ref PSI is
        # recalculated from the (weighted) median of the existing values.
        if recalculate_ref_psi and has_virtual:
            allPSI_elems_left[0] = recalculateRefPSI_list(
                allPSI_elems_left, col2weights)
            allPSI_elems_right[0] = recalculateRefPSI_list(
                allPSI_elems_right, col2weights)

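        # dPSI is assumed to return the spread (max - min) of the non-NA PSI
        # values; events whose left or right spread falls below dpsi_thresh
        # are flagged NA and skipped.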
        if dPSI(allPSI_elems_left) < dpsi_thresh or dPSI(
                allPSI_elems_right) < dpsi_thresh:

            for j in range(1, left_length):
                if event in event2pairs2idx:
                    event2pairs2idx[event][(0, j)] = NA
                else:
                    event2pairs2idx[event] = {(0, j): NA}
                if event in event2col2idx:
                    event2col2idx[event][j] = NA
                else:
                    event2col2idx[event] = {j: NA}

            continue

        [left_col1_excl, left_col1_incl
         ] = list(map(int, left_events2counts[event][0].split(";")))
        [right_col1_excl, right_col1_incl
         ] = list(map(int, right_events2counts[event][0].split(";")))

        if left_col1_excl + left_col1_incl < sum_thresh:
            continue  # the reference must have a PSI

        if right_col1_excl + right_col1_incl < sum_thresh:
            continue  # the reference must have a PSI

        # Adjust ref counts based on PSI
        if recalculate_ref_psi and has_virtual:
            left_col1_excl, left_col1_incl = adjustRefCounts(
                event, jcn_seq_len,
                left_lenNormalized_counts_event2total_counts[event][0],
                float(allPSI_elems_left[0]), left_col1_excl, left_col1_incl)

            right_col1_excl, right_col1_incl = adjustRefCounts(
                event, jcn_seq_len,
                right_lenNormalized_counts_event2total_counts[event][0],
                float(allPSI_elems_right[0]), right_col1_excl, right_col1_incl)

        for j in range(1, total_samples):

            [left_col2_excl, left_col2_incl
             ] = list(map(int, left_events2counts[event][j].split(";")))
            [right_col2_excl, right_col2_incl
             ] = list(map(int, right_events2counts[event][j].split(";")))

            if j not in event_type2col2pvals["intron_retention"]:
                event_type2col2pvals["intron_retention"][j] = []

            # Both samples must meet the minimum count threshold
            if (belowThreshold(sum_thresh, left_col1_excl, left_col1_incl,
                               left_col2_excl, left_col2_incl)
                    or belowThreshold(sum_thresh, right_col1_excl,
                                      right_col1_incl, right_col2_excl,
                                      right_col2_incl)):
                if event in event2pairs2idx:
                    event2pairs2idx[event][(0, j)] = NA
                else:
                    event2pairs2idx[event] = {(0, j): NA}
                if event in event2col2idx:
                    event2col2idx[event][j] = NA
                else:
                    event2col2idx[event] = {j: NA}
                continue

            cur_len = len(event_type2pvals["intron_retention"])
            cur_len2 = len(event_type2col2pvals["intron_retention"][j])

            if event in event2pairs2idx:
                event2pairs2idx[event][(0, j)] = cur_len
            else:
                event2pairs2idx[event] = {(0, j): cur_len}

            if event in event2col2idx:
                event2col2idx[event][j] = cur_len2
            else:
                event2col2idx[event] = {j: cur_len2}

            _, left_pval = scipy.stats.fisher_exact(
                [[left_col1_excl, left_col1_incl],
                 [left_col2_excl, left_col2_incl]])

            _, right_pval = scipy.stats.fisher_exact(
                [[right_col1_excl, right_col1_incl],
                 [right_col2_excl, right_col2_incl]])

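            # For independent tests under the null, left_pval + right_pval -
            # left_pval * right_pval = 1 - (1 - left_pval)(1 - right_pval),
            # the probability that at least one of the two boundary tests
            # yields a p-value this small.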
            combined_pval = (left_pval + right_pval) - left_pval * right_pval

            event_type2pvals["intron_retention"].append(combined_pval)

            updateDictOfLists(event_type2col2pvals["intron_retention"], j,
                              combined_pval)

        # All samples have been processed, now print to allPSI
        if left_all_psi_output:
            left_all_psi_output.write(event + "\t" +
                                      "\t".join(allPSI_elems_left) + "\n")
        if right_all_psi_output:
            right_all_psi_output.write(event + "\t" +
                                       "\t".join(allPSI_elems_right) + "\n")

    if left_all_psi_output:
        left_all_psi_output.close()
    if right_all_psi_output:
        right_all_psi_output.close()

    # All pairs have been evaluated, so now do multiple testing correction on
    # everything
    event_type2adjusted_pvals = {}
    event_type2col2adjusted_pvals = {}

    for event_type in event_type2pvals:
        event_type2adjusted_pvals[event_type] = list(
            multitest.multipletests(event_type2pvals[event_type],
                                    method=method)[1])

    for event_type in event_type2col2pvals:
        event_type2col2adjusted_pvals[event_type] = {}
        for col in event_type2col2pvals[event_type]:
            event_type2col2adjusted_pvals[event_type][col] = list(
                multitest.multipletests(event_type2col2pvals[event_type][col],
                                        method=method)[1])
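
    # NOTE: statsmodels' multipletests expects "fdr_bh" for Benjamini-Hochberg
    # ("BH" is R's p.adjust spelling), so the method string passed in is
    # assumed to be one statsmodels recognizes, e.g.:
    #   multitest.multipletests([0.01, 0.04, 0.2], method="fdr_bh")[1]
    #   # -> BH-adjusted p-values [0.03, 0.06, 0.2]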

    # Now go through all events and only consider those that are significant
    psi_out.write(header + "\n")
    if psi_out_by_samp:
        psi_out_by_samp.write(header + "\n")
    if all_psi_output:
        all_psi_output.write(header + "\n")

    for event in event2pairs2idx:
        sign_cols = set([])
        sign_cols2 = set([])
        event_type = getEventType(event)

        for pair in event2pairs2idx[event]:
            this_idx = event2pairs2idx[event][pair]
            this_idx2 = event2col2idx[event][pair[1]]
            if this_idx == NA:
                continue

            outline = "%s\t%d\t%d\t%f" % (
                event, pair[0], pair[1],
                event_type2pvals[event_type][this_idx])
            if psi_out_by_samp:
                outline += "\t%f" % event_type2col2adjusted_pvals[event_type][
                    pair[1]][this_idx2]

            outline += "\t%f\n" % event_type2adjusted_pvals[event_type][
                this_idx]
            pval_out.write(outline)

            if event_type2adjusted_pvals[event_type][this_idx] < sign_cutoff:
                sign_cols.add(pair[0])
                sign_cols.add(pair[1])

            if psi_out_by_samp:
                if event_type2col2adjusted_pvals[event_type][
                        pair[1]][this_idx2] < sign_cutoff:
                    sign_cols2.add(pair[0])
                    sign_cols2.add(pair[1])

        # Write out PSI for any significant samples
        # Significant across all samples
        if sign_cols != set([]):
            psi_vals = []
            for i in range(total_samples):
                if i in sign_cols:
                    psi_vals.append(event2col2psi[event][i])
                else:
                    psi_vals.append(NA)

            outline = "%s\t%s\n" % (event, "\t".join(psi_vals))
            psi_out.write(outline)

        # Significant by samples
        if sign_cols2 != set([]):
            psi_vals = []
            for i in range(total_samples):
                if i in sign_cols2:
                    psi_vals.append(event2col2psi[event][i])
                    if event_sum:
                        event_sum.write("%s\t%d\t%s\n" %
                                        (event, i, event2col2sum[event][i]))
                else:
                    psi_vals.append(NA)

            outline = "%s\t%s\n" % (event, "\t".join(psi_vals))
            psi_out_by_samp.write(outline)

        # Print all psi
        if all_psi_output:
            psi_vals = []
            for i in range(total_samples):
                try:
                    psi_vals.append(event2col2psi[event][i])
                except:
                    psi_vals.append(NA)

            outline = "%s\t%s\n" % (event, "\t".join(psi_vals))

            all_psi_output.write(outline)

    psi_out.close()
    psi_out_by_samp.close()
    all_psi_output.close()
    pval_out.close()

    sys.exit(0)


def main():
    opt_parser = OptionParser()
    # Add Options. Required options should have default=None
    opt_parser.add_option("--in_prefix",
                          dest="in_prefix",
                          type="string",
                          help="""Prefix of output files created from
                                  createAS_CountTables. In createAS_CountTables,
                                  this is the -o option""",
                          default=None)
#   opt_parser.add_option("-i",
#                         dest="input_file",
#                         type="string",
#                         help="Resulting file from clusterASExons2.py",
#                         default=None)
#   opt_parser.add_option("--left_intron",
#                         dest="left_input",
#                         type="string",
#                         help="""Resulting file from clusterASExons2.py, which
#                                 contains the exclusion and inclusion counts
#                                 for just the left side of an intron retention
#                                 event.""",
#                         default=None)
#   opt_parser.add_option("--right_intron",
#                         dest="right_input",
#                         type="string",
#                         help="""Resulting file from clusterASExons2.py, which
#                                 contains the exclusion and inclusion counts
#                                 for just the right side of an intron retention
#                                 event.""",
#                         default=None)
#   opt_parser.add_option("--lenNormalized_counts",
#                         dest="lenNormalized_counts",
#                         type="string",
#                         help="""File containing length-normalized inclusion
#                                 exclusion counts. Used for PSI calculation,
#                                 not for statistical significance.""",
#                         default=None)
#   opt_parser.add_option("--lenNormalized_left_intron",
#                         dest="lenNormalized_left_intron_counts",
#                         type="string",
#                         help="""File containing the length-normalized
#                                 left intron_retention counts.
#                                 Used for PSI calculation, not for
#                                 statistical significance.""",
#                         default=None)
#   opt_parser.add_option("--lenNormalized_right_intron",
#                         dest="lenNormalized_right_intron_counts",
#                         type="string",
#                         help="""File containing the length-normalized
#                                 right intron_retention counts.
#                                 Used for PSI calculation, not for
#                                 statistical significance.""",
#                         default=None)
    opt_parser.add_option("--has_virtual",
                          dest="has_virtual",
                          action="store_true",
                          help="""Gives flags that a virtual reference is being
                                  used.""",
                          default=False)
    opt_parser.add_option("--jcn_seq_len",
                          dest="jcn_seq_len",
                          type="int",
                          help="""Junction length. Used as an option in
                                  getASEventReadCounts.py""",
                          default=None)
    opt_parser.add_option("--output_dir",
                          dest="output_dir",
                          type="string",
                          help="Directory to place output files.",
                          default=None)
    opt_parser.add_option("--out_prefix",
                          dest="prefix",
                          type="string",
                          help="Prefix of all output files. DEF=None",
                          default=None)
#   opt_parser.add_option("--psi_output_most_sign",
#                         dest="psi_output",
#                         type="string",
#                         help="""Output file that will contain the PSI values
#                                 for all events and samples that are
#                                 significantly spliced.""",
#                         default=None)
#   opt_parser.add_option("--psi_output_sign_by_samp",
#                         dest="psi_output_by_samp",
#                         type="string",
#                         help="""Output file that will contain the PSI values
#                                 for all events and samples that are
#                                 significantly differentially spliced where
#                                 multiple testing is not done for all samples
#                                 tested against the virtual reference""",
#                         default=None)
#   opt_parser.add_option("--all_psi_output",
#                         dest="all_psi_output",
#                         type="string",
#                         help="""Output file that will contain the PSI values
#                                 for all events and samples that pass minimum
#                                 count thresholds""",
#                         default=None)
#   opt_parser.add_option("--left_intron_all_psi_output",
#                         dest="left_intron_all_psi_output",
#                         type="string",
#                         help="""Output file that will contain the PSI values
#                                 for the left side of intron retention
#                                 samples. Not required, but used for dPSI
#                                 thresholds when taking all splice events.""",
#                         default=None)
#   opt_parser.add_option("--right_intron_all_psi_output",
#                         dest="right_intron_all_psi_output",
#                         type="string",
#                         help="""Output file that will contain the PSI values
#                                 for the right side of intron retention
#                                 samples. Not required, but used for dPSI
#                                 thresholds when taking all splice events.""",
#                         default=None)
#   opt_parser.add_option("--recalculate_ref_psi",
#                         dest="recalculate_ref_psi",
#                         action="store_true",
#                         help="""The reference PSI given in input tables
#                                 should be recalculated due to changes in
#                                 thresholding for minimum input between
#                                 length-normalized and raw counts.""",
#                         default=False)
#   opt_parser.add_option("--pval_output",
#                         dest="pval_output",
#                         type="string",
#                         help="""Output file that will associate the
#                                 unadjusted and adjusted p-values for all
#                                 pairs that were tested.""",
#                         default=None)
#   opt_parser.add_option("--event_sum",
#                         dest="event_sum",
#                         type="string",
#                         help="""Output file that will contain the sum of the
#                                 exclusion and inclusion counts for every
#                                 sample that was considered significantly
#                                 affected.""",
#                         default=None)
    opt_parser.add_option("--thresh",
                          dest="threshold",
                          type="int",
                          help="""Threshold for minimum number of total reads
                                  in an event. Default=%d""" % DEF_THRESH,
                          default=DEF_THRESH)
    opt_parser.add_option("--min_dpsi_threshold",
                          dest="dpsi_threshold",
                          type="float",
                          help="""Threshold for minimum delta PSI value between
                                  the sample with the smallest and largest PSI.
                                  Events with dPSI values below the threshold
                                  will not be tested or reported. Default=%.2f""" % DEF_DPSI_THRESH,
                          default=DEF_DPSI_THRESH)
    opt_parser.add_option("--method",
                          dest="method",
                          type="string",
                          help="""Correction Method: "BH" - Benjamini & Hochberg,
                                  "bonferroni".  Must select these strings as
                                  the option""",
                          default=None)
    opt_parser.add_option("--sign_cutoff",
                          dest="sign_cutoff",
                          type="float",
                          help="""Cutoff of corrected p-value significance.
                                  Default=%.2f""" % DEF_SIGN_CUTOFF,
                          default=DEF_SIGN_CUTOFF)
    opt_parser.add_option("--weights",
                          dest="weights",
                          type="string",
                          help="""Comma separated list of weights given in the
                                  order of the samples in the table. Weights are
                                  used to create a weighted median. Default is
                                  equal weight for all samples.""",
                          default=None)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
#    opt_parser.check_required("-i")
#    opt_parser.check_required("--psi_output_most_sign")
#    opt_parser.check_required("--pval_output")
#    opt_parser.check_required("--event_sum")
    opt_parser.check_required("--method")
    opt_parser.check_required("--in_prefix")
    opt_parser.check_required("--out_prefix")
    opt_parser.check_required("--jcn_seq_len")

    in_prefix = options.in_prefix
    prefix = options.prefix

    try:
        input_file = open(in_prefix + "_AS_exclusion_inclusion_counts.txt")
    except IOError:
        print """Cannot find expected file %s_AS_exclusion_inclusion_counts.txt.
                 Please check that the same -o prefix was given to
                 combine_createAS_CountTables""" % in_prefix
        opt_parser.print_help()
        sys.exit(1)

    left_input_file_name = in_prefix + "_left_intron_counts.txt"
    right_input_file_name = in_prefix + "_right_intron_counts.txt"
    sum_thresh = options.threshold

    sign_cutoff = options.sign_cutoff

    dpsi_thresh = options.dpsi_threshold

    left_input_file = None
    right_input_file = None
    if not os.path.exists(left_input_file_name):
        print "Warning: No intron retention file given as input.  Will not calculate IR events."
    else:
        left_input_file = open(left_input_file_name)
        right_input_file = open(right_input_file_name)

    output_dir = formatDir(options.output_dir)

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    out_prefix = "%s/%s" % (output_dir, options.prefix)
    
    psi_out = open("%s_most_sign_PSI.txt" % out_prefix, "w")
    pval_out = open("%s_pairs_p_val.txt" % out_prefix, "w")

    has_virtual = options.has_virtual

    # Optional output files
    psi_out_by_samp = open("%s_sign_by_samp_PSI.txt" % out_prefix, "w")
   
    all_psi_output = open("%s_allPSI.txt" % out_prefix, "w")

    left_all_psi_output = open("%s_left_intron_retention_allPSI.txt" % out_prefix, "w")

    right_all_psi_output = open("%s_right_intron_retention_allPSI.txt" % out_prefix, "w")

    jcn_seq_len = options.jcn_seq_len

    recalculate_ref_psi = False
    lenNormalized_counts_event2PSIs = None
    lenNormalized_counts_event2total_counts = None
#   if options.lenNormalized_counts:
#       if ((not options.lenNormalized_left_intron_counts) or 
#           (not options.lenNormalized_right_intron_counts)):
#           print "Need to specify all length-normalized count files."
#           opt_parser.print_help()
#           sys.exit(1)
#   
    recalculate_ref_psi = True
    lenNormalized_counts = open(in_prefix + "_AS_exclusion_inclusion_counts_lenNorm.txt")
    (lenNormalized_counts_event2total_counts,
     lenNormalized_counts_event2PSIs) = buildDicts(lenNormalized_counts)
    lenNormalized_counts.close()

    left_lenNormalized_counts_event2total_counts = None
    left_lenNormalized_counts_event2PSIs = None
#   if options.lenNormalized_left_intron_counts:
#       if ((not options.lenNormalized_counts) or 
#           (not options.lenNormalized_right_intron_counts)):
#           print "Need to specify all length-normalized count files."
#           opt_parser.print_help()
#           sys.exit(1)

    left_lenNormalized_counts = open(in_prefix + "_left_intron_counts_lenNorm.txt")
    (left_lenNormalized_counts_event2total_counts,
     left_lenNormalized_counts_event2PSIs) = buildDicts(left_lenNormalized_counts)
    left_lenNormalized_counts.close()

    right_lenNormalized_counts_event2total_counts = None
    right_lenNormalized_counts_event2PSIs = None
#   if options.lenNormalized_right_intron_counts:
#       if ((not options.lenNormalized_counts) or 
#           (not options.lenNormalized_left_intron_counts)):
#           print "Need to specify all length-normalized count files."
#           opt_parser.print_help()
#           sys.exit(1)

    right_lenNormalized_counts = open(in_prefix + "_right_intron_counts_lenNorm.txt")
    (right_lenNormalized_counts_event2total_counts,       
     right_lenNormalized_counts_event2PSIs) = buildDicts(right_lenNormalized_counts)
    right_lenNormalized_counts.close()

#    if options.lenNormalized_counts:
#       if not jcn_seq_len:
#           print "If length normalized counts are specified, need to give jcn_seq_len"
#           opt_parser.print_help()
#           sys.exit(1)

    weights = None
    if options.weights:
        weights = map(float, options.weights.split(","))

        # Use R limma package
        try:
            r.library("limma")
        except:
            print """In order to use weighted median, please install the limma package from Bioconductor: 
                     http://www.bioconductor.org/packages/release/bioc/html/limma.html"""
            print """In R:\nsource("http://bioconductor.org/biocLite.R")\nbiocLite("limma")"""

    event_sum = open("%s_event_sum.txt" % out_prefix, "w")

    method = options.method
    if method != "BH" and method != "bonferroni":
        print "Wrong method indicated."
        opt_parser.print_help()
        sys.exit(1)

    # {event_type:[pval]}
    event_type2pvals = {}

    # {event:(col1, col2):pval_idx}
    event2pairs2idx = {} 

    # Additional pval holders tested by each sample against the reference
    # {event_type:col:[pval]}
    event_type2col2pvals = {}

    # {event:col:pval_idx}
    event2col2idx = {}

    # {event:{col:psi}}
    event2col2psi = {}

    # {event:{col:sum_counts}}
    event2col2sum = {}

    # For weighted median
    col2weights = None

    header = None
    total_samples = None
    for line in input_file:
        line = formatLine(line)

        if line.startswith("#"):
            header = line
            line_list = line.split("\t")
            samples = line_list[11:]
            total_samples = len(samples)
            if weights:
                if len(weights) != total_samples-1:
                    print "Weights for every sample needs to be given"
                    opt_parser.print_help()
                    sys.exit(1)

                col2weights = {}
                for i in range(1,total_samples):
                    col2weights[i-1] = weights[i-1]
            continue

        line_list = line.split("\t")

        event = "\t".join(line_list[0:11])
        counts = line_list[11:]

        # If the reference is NA, then do not calculate anything
        if counts[0] == NA:
            continue

        if has_virtual:
            # Cannot do a comparison when virtual reference is low expressed
            if lenNormalized_counts_event2total_counts[event][0] == NA:
                continue
        
        lenNormalized_psis = [None for i in range(len(counts))]
        if lenNormalized_counts_event2PSIs:
            try:
                lenNormalized_psis = lenNormalized_counts_event2PSIs[event]
            except:
                print "Warning: Can't find event in lenNormalized psis: %s" % event
                continue

        event_type = getEventType(event)
        if event_type not in event_type2pvals:
            event_type2pvals[event_type] = []
        if event_type not in event_type2col2pvals:
            event_type2col2pvals[event_type] = {}

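        # PSI (percent spliced in) is presumably inclusion / (inclusion +
        # exclusion), optionally length-normalized; getPSI_sample_sum also
        # returns the raw count sum used against sum_thresh.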
        # Fill PSI dict
        for i in range(total_samples):
            (psi, sum_ct) = getPSI_sample_sum(counts[i], sum_thresh,
                                              lenNormalized_psis[i])
            if event in event2col2psi:
                event2col2psi[event][i] = psi
                event2col2sum[event][i] = sum_ct
            else:
                event2col2psi[event] = {i:psi}
                event2col2sum[event] = {i:sum_ct}

        # Only PSIs in event2col2psi that passed sum_thresh are present; the
        # ref PSI is recalculated from the (weighted) median of the existing
        # values.
        if recalculate_ref_psi and has_virtual:
            adj_psi, adj_totalCount = recalculateRefPSI(event2col2psi[event],
                                                        lenNormalized_counts_event2total_counts[event],
                                                        col2weights)
            event2col2psi[event][0] = adj_psi
            lenNormalized_counts_event2total_counts[event][0] = adj_totalCount

        if dPSI(event2col2psi[event]) < dpsi_thresh:
            for j in range(1,total_samples):
                if event in event2pairs2idx:
                    event2pairs2idx[event][(0,j)] = NA
                else:
                    event2pairs2idx[event] = {(0,j):NA}
                if event in event2col2idx:
                    event2col2idx[event][j] = NA
                else:
                    event2col2idx[event] = {j:NA}

            continue

        # Calculate p-val for intron retention later
        if event_type == "intron_retention":
            continue

        # Do pairwise comparisons with first column
        [col1_excl, col1_incl] = map(int,counts[0].split(";"))
        if recalculate_ref_psi and has_virtual:
            # Need to also adjust relative counts based on new PSI
            col1_excl, col1_incl = adjustRefCounts(event,
                                                   jcn_seq_len,
                                                   lenNormalized_counts_event2total_counts[event][0],
                                                   float(event2col2psi[event][0]),
                                                   col1_excl,
                                                   col1_incl)

        for j in range(1,total_samples):

            if j not in event_type2col2pvals[event_type]:
                event_type2col2pvals[event_type][j] = []

            [col2_excl, col2_incl] = map(int,counts[j].split(";"))

            # Both samples must meet the minimum count threshold
            if belowThreshold(sum_thresh, col1_excl, col1_incl,
                              col2_excl, col2_incl):
                if event in event2pairs2idx:
                    event2pairs2idx[event][(0,j)] = NA
                else:
                    event2pairs2idx[event] = {(0,j):NA}
                if event in event2col2idx:
                    event2col2idx[event][j] = NA
                else:
                    event2col2idx[event] = {j:NA}

                continue

            cur_len = len(event_type2pvals[event_type])
            cur_len2 = len(event_type2col2pvals[event_type][j])

            if event in event2pairs2idx:
                event2pairs2idx[event][(0,j)] = cur_len
            else:
                event2pairs2idx[event] = {(0,j):cur_len}	

            if event in event2col2idx:
                event2col2idx[event][j] = cur_len2
            else:
                event2col2idx[event] = {j:cur_len2}

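            # rpy2 call into R's fisher.test: the IntVector fills the 2x2
            # matrix column-wise (reference counts in column 1, sample j in
            # column 2), and [0][0] extracts p.value, the first element of
            # the returned htest list.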
            raw_pval = robjects.r['fisher.test'](r.matrix(robjects.IntVector([col1_excl,
                                                                              col1_incl,
                                                                              col2_excl,
                                                                              col2_incl]),
                                                                              nrow=2))[0][0]

            event_type2pvals[event_type].append(raw_pval)

            updateDictOfLists(event_type2col2pvals[event_type], j, raw_pval)

    # Now calculate intron retention
    if left_input_file:
        left_events2counts = getIntronLeftRightCounts(left_input_file)
        right_events2counts = getIntronLeftRightCounts(right_input_file)
    else:
        left_events2counts = {}
        right_events2counts = {}

    if left_all_psi_output:
        left_all_psi_output.write(header + "\n")
    if right_all_psi_output:
        right_all_psi_output.write(header + "\n")

    for event in left_events2counts:
        if event not in right_events2counts:
            continue

        allPSI_elems_left = []
        allPSI_elems_right = []

        left_length = len(left_events2counts[event])
        right_length = len(right_events2counts[event])

        lenNormalized_left_psis = [None for i in range(left_length)]
        lenNormalized_right_psis = [None for i in range(right_length)]

        if left_lenNormalized_counts_event2PSIs:
            try:
                lenNormalized_left_psis = left_lenNormalized_counts_event2PSIs[event]
            except:
                print "Warning: Could not find event in left_lenNormalized psis: %s" % event
                continue
        if right_lenNormalized_counts_event2PSIs:
            try:
                lenNormalized_right_psis = right_lenNormalized_counts_event2PSIs[event]
            except:
                print "Warning: Could not find event in right_lenNormalized psis: %s" % event
                continue

        # Fill PSI dict
        for i in range(left_length):
            (psi, sum_ct) = getPSI_sample_sum(left_events2counts[event][i], sum_thresh,
                                              lenNormalized_left_psis[i])
            allPSI_elems_left.append(psi)

            try:
                (psi, sum_ct) = getPSI_sample_sum(right_events2counts[event][i], sum_thresh,
                                                  lenNormalized_right_psis[i])
            except:
                pdb.set_trace()
            allPSI_elems_right.append(psi)


#           # Adding left and right PSI values
#           if left_col2_excl + left_col2_incl < sum_thresh:
#               allPSI_elems_left.append(NA)
#           else:
#               allPSI_elems_left.append(getPSI(left_col2_excl, left_col2_incl,
#                                               lenNormalized_left_psis[j]))

#           if right_col2_excl + right_col2_incl < sum_thresh:
#               allPSI_elems_right.append(NA)
#           else:
#               allPSI_elems_right.append(getPSI(right_col2_excl,
#                                                right_col2_incl,
#                                                lenNormalized_right_psis[j]))

        # Only PSIs that passed sum_thresh are present; the ref PSI is
        # recalculated from the (weighted) median of the existing values.
        if recalculate_ref_psi and has_virtual:
            allPSI_elems_left[0] = recalculateRefPSI_list(allPSI_elems_left,
                                                          col2weights)
            allPSI_elems_right[0] = recalculateRefPSI_list(allPSI_elems_right,
                                                           col2weights)

        if dPSI(allPSI_elems_left) < dpsi_thresh or dPSI(allPSI_elems_right) < dpsi_thresh:

            for j in range(1,left_length):
                if event in event2pairs2idx:
                    event2pairs2idx[event][(0,j)] = NA
                else:
                    event2pairs2idx[event] = {(0,j):NA}
                if event in event2col2idx:
                    event2col2idx[event][j] = NA
                else:
                    event2col2idx[event] = {j:NA}

            continue

        [left_col1_excl, left_col1_incl] = map(int,left_events2counts[event][0].split(";"))
        [right_col1_excl, right_col1_incl] = map(int,right_events2counts[event][0].split(";"))


        if left_col1_excl + left_col1_incl < sum_thresh:
            continue # the reference must have a PSI

        if right_col1_excl + right_col1_incl < sum_thresh:
            continue # the reference must have a PSI

        # Adjust ref counts based on PSI
        if recalculate_ref_psi and has_virtual:
            left_col1_excl, left_col1_incl = adjustRefCounts(event,
                                                             jcn_seq_len,
                                                             left_lenNormalized_counts_event2total_counts[event][0],
                                                             float(allPSI_elems_left[0]), 
                                                             left_col1_excl, 
                                                             left_col1_incl)

            right_col1_excl, right_col1_incl = adjustRefCounts(event,
                                                               jcn_seq_len,
                                                               right_lenNormalized_counts_event2total_counts[event][0],
                                                               float(allPSI_elems_right[0]), 
                                                               right_col1_excl, 
                                                               right_col1_incl)


        for j in range(1,total_samples):

            [left_col2_excl, left_col2_incl] = map(int,left_events2counts[event][j].split(";"))
            [right_col2_excl, right_col2_incl] = map(int,right_events2counts[event][j].split(";"))

            if j not in event_type2col2pvals["intron_retention"]:
                event_type2col2pvals["intron_retention"][j] = []

            # Both samples must meet the minimum count threshold
            if (belowThreshold(sum_thresh, left_col1_excl, left_col1_incl,
                              left_col2_excl, left_col2_incl) or
                belowThreshold(sum_thresh, right_col1_excl, right_col1_incl,
                               right_col2_excl, right_col2_incl)):
                if event in event2pairs2idx:
                    event2pairs2idx[event][(0,j)] = NA
                else:
                    event2pairs2idx[event] = {(0,j):NA}
                if event in event2col2idx:
                    event2col2idx[event][j] = NA
                else:
                    event2col2idx[event] = {j:NA}
                continue

            cur_len = len(event_type2pvals["intron_retention"])
            cur_len2 = len(event_type2col2pvals["intron_retention"][j])

            if event in event2pairs2idx:
                event2pairs2idx[event][(0,j)] = cur_len
            else:
                event2pairs2idx[event] = {(0,j):cur_len}	

            if event in event2col2idx:
                event2col2idx[event][j] = cur_len2
            else:
                event2col2idx[event] = {j:cur_len2}

            left_pval = robjects.r['fisher.test'](r.matrix(robjects.IntVector([left_col1_excl, 
                                                                               left_col1_incl,
                                                                               left_col2_excl,
                                                                               left_col2_incl]),
                                                           nrow=2))[0][0] 
            
            right_pval = robjects.r['fisher.test'](r.matrix(robjects.IntVector([right_col1_excl, 
                                                                                right_col1_incl,
                                                                                right_col2_excl,
                                                                                right_col2_incl]),
                                                           nrow=2))[0][0] 

            combined_pval = (left_pval + right_pval) - left_pval * right_pval

            event_type2pvals["intron_retention"].append(combined_pval)

            updateDictOfLists(event_type2col2pvals["intron_retention"], j, combined_pval)


        # All samples have been processed, now print to allPSI
        if left_all_psi_output:
            left_all_psi_output.write(event + "\t" +
                                      "\t".join(allPSI_elems_left) + "\n")
        if right_all_psi_output:
            right_all_psi_output.write(event + "\t" +
                                      "\t".join(allPSI_elems_right) + "\n")

    if left_all_psi_output:
        left_all_psi_output.close()
    if right_all_psi_output:
        right_all_psi_output.close()

    # All pairs have been evaluated, so now do multiple testing correction on
    # everything
    event_type2adjusted_pvals = {}
    event_type2col2adjusted_pvals = {}

    for event_type in event_type2pvals:
        event_type2adjusted_pvals[event_type] = robjects.r['p.adjust'](robjects.FloatVector(event_type2pvals[event_type]),
                                                                       method) 
    
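    # R's p.adjust accepts "BH" and "bonferroni" directly, matching the exact
    # strings validated for the --method option above.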
    for event_type in event_type2col2pvals:
        event_type2col2adjusted_pvals[event_type] = {}
        for col in event_type2col2pvals[event_type]:
            event_type2col2adjusted_pvals[event_type][col] = robjects.r['p.adjust'](robjects.FloatVector(event_type2col2pvals[event_type][col]),
                                                                                    method)

    # Now go through all events and only consider those that are significant
    psi_out.write(header + "\n")
    if psi_out_by_samp:
        psi_out_by_samp.write(header + "\n")
    if all_psi_output:
        all_psi_output.write(header + "\n")

    for event in event2pairs2idx:
        sign_cols = set([])
        sign_cols2 = set([])
        event_type = getEventType(event)

        for pair in event2pairs2idx[event]:
            this_idx = event2pairs2idx[event][pair]
            this_idx2 = event2col2idx[event][pair[1]]
            if this_idx == NA:
                continue

            outline = "%s\t%d\t%d\t%f" % (event,
                                          pair[0], pair[1],
                                          event_type2pvals[event_type][this_idx])
            if psi_out_by_samp:
                outline += "\t%f" % event_type2col2adjusted_pvals[event_type][pair[1]][this_idx2]

            outline += "\t%f\n" % event_type2adjusted_pvals[event_type][this_idx]
            pval_out.write(outline)

            if event_type2adjusted_pvals[event_type][this_idx] < sign_cutoff:
                sign_cols.add(pair[0])
                sign_cols.add(pair[1])

            if psi_out_by_samp:
                if event_type2col2adjusted_pvals[event_type][pair[1]][this_idx2] < sign_cutoff:
                    sign_cols2.add(pair[0])
                    sign_cols2.add(pair[1])

        # Write out PSI for any significant samples
        # Significant across all samples
        if sign_cols != set([]):        
            psi_vals = []
            for i in range(total_samples):
                if i in sign_cols:
                    psi_vals.append(event2col2psi[event][i])
                else:
                    psi_vals.append(NA) 

            outline = "%s\t%s\n" % (event, 
                                    "\t".join(psi_vals))
            psi_out.write(outline)
   
        # Significant by samples 
        if sign_cols2 != set([]):        
            psi_vals = []
            for i in range(total_samples):
                if i in sign_cols2:
                    psi_vals.append(event2col2psi[event][i])
                    if event_sum:
                        event_sum.write("%s\t%d\t%s\n" % (event,
                                                          i,
                                                          event2col2sum[event][i]))
                else:
                    psi_vals.append(NA) 

            outline = "%s\t%s\n" % (event, 
                                    "\t".join(psi_vals))
            psi_out_by_samp.write(outline)

        # Print all psi
        if all_psi_output:
            psi_vals = []
            for i in range(total_samples):
                try:
                    psi_vals.append(event2col2psi[event][i])
                except:
                    psi_vals.append(NA)

            outline = "%s\t%s\n" % (event, 
                                    "\t".join(psi_vals))

            all_psi_output.write(outline)
    
    psi_out.close()
    psi_out_by_samp.close()
    all_psi_output.close()
    pval_out.close()

    sys.exit(0)


def main():
    opt_parser = OptionParser()
   
    # Add Options. Required options should have default=None
    opt_parser.add_option("-i",
                          dest="intron_coords",
                          type="string",
                          help="""File of intron coordinates.  Format:
                                  type, chr, strand, start, end""",
                          default=None)
    opt_parser.add_option("-b",
                          dest="bed_intron_coords",
                          type="string",
                          help="BED file of intron coordinates.",
                          default=None)
    opt_parser.add_option("-a",
                          dest="read_alignments",
                          type="string",
                          help="""File of alignments to genome. 
                                  Format:
                                  chr, start, strand""",
                          default=None)
    opt_parser.add_option("-f",
                          dest="flanking_dist",
                          type="int",
                          help="""Distance away from exon intron junction to
                                  check for reads in.""",
                          default=None)
    opt_parser.add_option("-o",
                          dest="offsets",
                          type="int",
                          help="""Minimum number of offsets required at each
                                  exon/intron junction. Default=1""",
                          default=1)
    opt_parser.add_option("-l",
                          dest="read_length",
                          type="int",
                          help="Length of the reads.",
                          default=1)
    opt_parser.add_option("--out_dir",
                          dest="out_dir",
                          type="string",
                          help="Output files are put here.",
                          default=None)
    opt_parser.add_option("--out_prefix",
                          dest="prefix",
                          type="string",
                          help="Prefix attached to all output files.",
                          default=None)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("-a")
    opt_parser.check_required("-f")
    opt_parser.check_required("-l")
    opt_parser.check_required("--out_dir")
    opt_parser.check_required("--out_prefix")

    # Check that the COUNTING_SCRIPT path is valid
    if not os.path.exists(COUNTING_SCRIPT):
        print "Please change COUNTING_SCRIPT path."
        opt_parser.print_help()
        sys.exit(1)

    if options.intron_coords and options.bed_intron_coords:
        print "Only one type of intron coord can be used as input." 
        opt_parser.print_help()
        sys.exit(1)

    if (not options.intron_coords) and (not options.bed_intron_coords):   
        print " Need to specify intron coordinates. See options -i or -b"
        opt_parser.print_help()
        sys.exit(1)

    intron_coords = None
    isBedFormat = False
    if options.intron_coords:
        intron_coords = open(options.intron_coords)
    if options.bed_intron_coords:
        intron_coords = open(options.bed_intron_coords)
        isBedFormat = True
        
    read_alignments = options.read_alignments

    read_length = options.read_length

    flanking_dist = options.flanking_dist
    offsets = options.offsets

    prefix = options.prefix
    out_dir = options.out_dir

    if not out_dir.endswith("/"):
        out_dir += "/"

    if not os.path.exists(out_dir):
        print "Output directory does not exist"
        sys.exit(1)

    # Intermediate Output Files
    out_coords_file = out_dir + prefix + "_intron_exon_junction_coords.out"
    out_coords = open(out_coords_file, "w")

    out_read_assoc_file = out_dir + prefix + "_intron_exon_junction_coords_w_read.out"
    
    # Final output
    out_file_name = out_dir + prefix + "_intron_exon_junction_counts.txt"
    out_file = open(out_file_name, "w")

    confident_ie_name = out_dir + prefix + "_confident_ie.txt"
    confident_ie_file = open(confident_ie_name, "w")
   
    # The natural mapping is {intron_coord: {"left": (chr, start, end),
    #                                        "right": (chr, start, end)}},
    # where "left"/"right" are the flanking regions around the exon/intron
    # junction on each side. The dicts below hold the reverse mapping,
    # {region_coord: [intron_coord_str]}:
    left_region_coord2intron = {}
    right_region_coord2intron = {}

    # {intron_coord_str: {"left": {pos: count},
    #                     "right": {pos: count}}}
    intron_dict = {}

    regions_set = set([])

    for line in intron_coords:
        line = formatLine(line)

        if isBedFormat:
            if line.startswith("track"):
                continue
            chr, start_str, end_str = parseBEDLine(line)
        else:
            type, chr, strand, start_str, end_str = line.split("\t")

        if chr.startswith("chr"):
            chr = chr.replace("chr", "")

        intron_coord_str = "%s:%s-%s" % (chr, start_str, end_str)

        if intron_coord_str not in intron_dict:
            intron_dict[intron_coord_str] = {"left": {},
                                             "right": {}}

        start = int(start_str)
        end = int(end_str)

        left_coord = (chr,
                      start - flanking_dist,
                      start + flanking_dist - 1)

        right_coord = (chr,
                       end - flanking_dist + 1,
                       end + flanking_dist)
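
        # e.g. with flanking_dist = 10 and an intron spanning 1000-2000, the
        # left region covers 990-1009 and the right region covers 1991-2010:
        # flanking_dist bases on either side of each exon/intron boundary.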

        updateDictOfLists(left_region_coord2intron, left_coord,
                          intron_coord_str)
        updateDictOfLists(right_region_coord2intron, right_coord,
                          intron_coord_str)

        regions_set.add(left_coord)
        regions_set.add(right_coord)

    # Print out regions out_coords
    for region_coord in regions_set:

        out_line = "%s\t%d\t%d\n" % (region_coord[0],
                                     region_coord[1],
                                     region_coord[2])

        out_coords.write(out_line)

    out_coords.close()

    # Used to make unique name for tmp file in case a shared directory is being
    # used for runs.
    rand_num = random.randrange(1,100000)

    # Get Read Counts
    print "Getting Counts in Region"
    cmd = "python %s --reads %s -l %d --coords %s -o %stmp%d.txt --read_assoc %s" % (COUNTING_SCRIPT,
                                                                          read_alignments,
                                                                          read_length,
                                                                          out_coords_file,
                                                                          out_dir,
                                                                          rand_num,
                                                                          out_read_assoc_file)
    print cmd
#    runCmd(cmd, SHELL)
    os.system(cmd)

    # Remove the tmp file
#    runCmd("rm %stmp%d.txt" % (out_dir, rand_num), SHELL)
    os.system("rm %stmp%d.txt" % (out_dir, rand_num))

    print "Getting Left and Right Counts"
    # Parse read_assoc_file to get information
    read_assoc_file = open(out_read_assoc_file)

    for line in read_assoc_file:
        line = formatLine(line)

        line_list = line.split("\t")

        read_start, read_end = getReadStartEnd(line_list[1]) 

        region_coord = getRegionCoord(line_list[2])
        # NOTE: intron_coord_list is computed here but not used below.
        intron_coord_list = getIntronStartEnds(left_region_coord2intron,
                                               right_region_coord2intron,
                                               region_coord)

        if region_coord in left_region_coord2intron: 
            for intron_str in left_region_coord2intron[region_coord]:
                # Put in left dictionaries
                if read_end not in intron_dict[intron_str]["left"]:
                    intron_dict[intron_str]["left"][read_end] = 1
                else:
                    intron_dict[intron_str]["left"][read_end] += 1

        if region_coord in right_region_coord2intron:
            for intron_str in right_region_coord2intron[region_coord]:
                # Check right dictionary
                if read_end not in intron_dict[intron_str]["right"]:
                    intron_dict[intron_str]["right"][read_end] = 1
                else:
                    intron_dict[intron_str]["right"][read_end] += 1

    # Print output
    confident_ie_set = set([])
    for intron_str in intron_dict:
#       chr, intron_start_str, intron_end_str = intron_str.split("_")
#       intron_start = int(intron_start_str)
#       intron_end = int(intron_end_str) 
        chr, intron_start, intron_end = convertCoordStr(intron_str)

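        # The "left"/"right" dicts map distinct read end positions to counts;
        # requiring at least `offsets` distinct positions is presumably meant
        # to avoid calling a junction confident from reads stacked at a
        # single offset.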
        # Get left_counts
        if len(intron_dict[intron_str]["left"]) >= offsets:
            left_count = getTotalCounts(intron_dict[intron_str]["left"])
            confident_ie = "%s:%d-%d" % (chr, intron_start - 1, intron_start)
            confident_ie_set.add(confident_ie)
        else:
            left_count = 0

        # Get right counts
        if len(intron_dict[intron_str]["right"]) >= offsets:
            right_count = getTotalCounts(intron_dict[intron_str]["right"])
            confident_ie = "%s:%d-%d" % (chr, intron_end, intron_end + 1)
            confident_ie_set.add(confident_ie)
        else:
            right_count = 0

        if left_count == 0 and right_count == 0:
            continue

        print_line = "%s\t%d\t%d\n" % (intron_str,
                                       left_count,
                                       right_count)

        out_file.write(print_line)

    # Now print out confident set of ie
    for ie in confident_ie_set:
        confident_ie_file.write("%s\n" % ie)

    confident_ie_file.close()

    sys.exit(0)