def main():
    """Nudge PSL alignment junctions toward nearby reference junctions.

    Parses the command line, indexes the junctions of the reference genepred
    by chromosome, then reads the PSL input one line at a time.  Each valid
    alignment is converted to a genepred entry, passed through
    ``GenePredBasics.smooth_gaps`` with ``--min_intron_size``, and handed to
    ``nudge`` together with the reference junctions of its target sequence.
    The result is printed to stdout as a genepred line, or as a fake PSL line
    when ``--output_fake_psl`` supplies a genome FASTA.

    PSL lines whose tStarts/qStarts/blockSizes lengths disagree are skipped
    with a warning on stderr.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size',
                        type=int,
                        default=68,
                        help="INT min intron size")
    parser.add_argument(
        '--min_local_support',
        type=int,
        default=0,
        help=
        "INT min number of junctions within search_size of a junction in order to count it"
    )
    parser.add_argument('--search_size',
                        type=int,
                        default=10,
                        help="INT search space for reference")
    parser.add_argument(
        '--output_fake_psl',
        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    # The positional is loaded with GenePredFile below, so it is labelled as a
    # genepred file rather than a FASTA file.
    parser.add_argument('reference_genepred',
                        help="GENEPREDFILENAME for reference genepred")
    args = parser.parse_args()

    # The genome is only needed when fake PSL output was requested.
    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    # Index the reference junctions: ref[chrom][left][right] = strand, where
    # left is the end of exon i-1 and right is the start of exon i plus one
    # (junctions are stored 1-based).  The first strand seen for a junction
    # is kept.
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    ref = {}
    for gp in gpf.entries:
        e = gp.entry
        exon_count = len(e['exonStarts'])
        if exon_count <= 1:
            continue  # single-exon entries contribute no junctions
        chrom_juns = ref.setdefault(e['chrom'], {})
        for i in range(1, exon_count):
            left = e['exonEnds'][i - 1]
            right = e['exonStarts'][i] + 1
            chrom_juns.setdefault(left, {}).setdefault(right, e['strand'])

    pf = GenericFileReader(args.psl)
    while True:
        line = pf.readline()
        if not line:
            break
        if line.startswith('#'):
            continue  # comment line
        pe = PSLBasics.line_to_entry(line.rstrip())
        block_count = len(pe['blockSizes'])
        if len(pe['tStarts']) != block_count or len(
                pe['qStarts']) != block_count:
            sys.stderr.write("WARNING invalid psl\n")
            continue
        genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
        ge = GenePredBasics.smooth_gaps(
            GenePredBasics.line_to_entry(genepred_line), args.min_intron_size)
        # Alignments to a target absent from the reference get no junctions.
        refjuns = ref.get(pe['tName'], {})
        new_ge = nudge(pe, ge, refjuns, args)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        if args.output_fake_psl:
            print(GenePredBasics.entry_to_fake_psl_line(new_ge, genome))
        else:
            print(GenePredBasics.entry_to_line(new_ge))
# Example no. 2
# 0
def main():
    """Compare the entries of genepred file A against genepred file B.

    Parses the command line, derives the reciprocal-overlap thresholds for
    the first, internal and last exons, loads both genepred files, and calls
    ``check_B_entries`` once for every entry of A.

    ``--minoverlap`` sets all three thresholds at once; the
    ``--minoverlap_first``, ``--minoverlap_internal`` and
    ``--minoverlap_last`` options then override their individual slot.
    Thresholds left unset stay at 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('a', nargs=1, help='FILENAME genepred file A')
    parser.add_argument('b', nargs=1, help='FILENAME genepred file B')
    parser.add_argument('--minexoncount',
                        nargs='?',
                        help='INT the minimum number of exons required.')
    parser.add_argument(
        '--minoverlap_internal',
        nargs='?',
        help=
        'FLOAT the fraction (0-1) of the required reciprocal overlap of an internal exon to call an exon a match.'
    )
    parser.add_argument(
        '--minoverlap_first',
        nargs='?',
        help=
        'FLOAT the fraction (0-1) of the required reciprocal overlap of the first exon to call an exon a match.'
    )
    parser.add_argument(
        '--minoverlap_last',
        nargs='?',
        help=
        'FLOAT the fraction (0-1) of the required reciprocal overlap of the last exon to call an exon a match.'
    )
    parser.add_argument(
        '--minoverlap',
        nargs='?',
        help=
        'FLOAT the fraction (0-1) of the required reciprocal overlap of any exon to call an exon a match.'
    )
    parser.add_argument(
        '--leftouterjoin',
        action='store_true',
        help=
        'Output the entry A regardless of whether a matching entry in B is found'
    )
    parser.add_argument('--output_a_not_in_b',
                        action='store_true',
                        help='Output entries that occur in A but not B')
    parser.add_argument(
        '--best_b_only',
        action='store_true',
        help=
        'Output only one entry of B for each A and try to pick the best based on reciprocal overlap'
    )
    parser.add_argument(
        '--allow_a_subset_of_b_fragments',
        action='store_true',
        help=
        'If A is just a subset of B, then call it as a match.  This means all exons of A found a conecutive match, but B could have more exons on either end.'
    )
    parser.add_argument(
        '--allow_any_fragments',
        action='store_true',
        help='If set, allow any partial match, not just the best')
    args = parser.parse_args()

    # Thresholds are ordered [first exon, internal exons, last exon].
    overlap = [0, 0, 0]
    if args.minoverlap:
        overlap = [float(args.minoverlap)] * 3
    # Specific options override the general one.  The first-exon slot must
    # read --minoverlap_first; reading --minoverlap_last here used the wrong
    # value and raised TypeError when --minoverlap_last was not given.
    if args.minoverlap_first:
        overlap[0] = float(args.minoverlap_first)
    if args.minoverlap_last:
        overlap[2] = float(args.minoverlap_last)
    if args.minoverlap_internal:
        overlap[1] = float(args.minoverlap_internal)

    # 'a' and 'b' use nargs=1, so each arrives as a one-element list.
    gpdA = GenePredBasics.GenePredFile(args.a[0])
    gpdB = GenePredBasics.GenePredFile(args.b[0])

    # NOTE: a multiprocessing (-p) code path existed here only as
    # commented-out code; entries of A are processed serially.
    for eA in gpdA.entries:
        check_B_entries(eA, gpdB, overlap, args)