def process_locus(lr, srin, args):
    """Evaluate one locus and return ``[outputs, totalrange]``.

    Returns ``None`` when ``lr`` is empty.  Otherwise every fuzzy junction
    of every entry in ``srin`` is tallied under a
    ``chr:left_end,right_end`` key, the ``lr`` entries are merged with
    ``greedy_gpd_list_to_combined_fuzzy_list`` and each merged group is
    passed to ``do_fuzzy`` together with the tally.  Groups are evaluated
    one after another in this function (no worker pool is used here).

    lr    -- sized collection of gene predictions for the locus
             (presumably the long reads -- confirm against the caller)
    srin  -- iterable of short-read gene predictions
    args  -- options object; ``junction_tolerance`` is read here and the
             whole object is forwarded to ``do_fuzzy``
    """
    if len(lr) == 0:
        return None
    totalrange = get_total_range(lr)

    # key -> {'cnt': times the key was seen, 'fzjun': first junction seen}
    junction_tally = {}
    for short_read in srin:
        fuzzy = GenePredFuzzyBasics.FuzzyGenePred(
            short_read, juntol=args.junction_tolerance)
        for junction in fuzzy.fuzzy_junctions:
            left = junction.left
            right = junction.right
            key = left.chr + ':' + str(left.end) + ',' + str(right.end)
            record = junction_tally.get(key)
            if record is None:
                record = {'cnt': 0, 'fzjun': junction}
                junction_tally[key] = record
            record['cnt'] += 1

    merged_groups = GenePredFuzzyBasics.greedy_gpd_list_to_combined_fuzzy_list(
        lr, args.junction_tolerance)

    # Flatten the per-group results into a single list.
    outputs = []
    for group in merged_groups:
        outputs.extend(do_fuzzy(group, junction_tally, args))
    return [outputs, totalrange]
Ejemplo n.º 2
0
def process_locus(lr, srin, args):
    """Run the junction evaluation for a single locus.

    An empty ``lr`` yields ``None``.  For anything else the result is the
    two-element list ``[outputs, totalrange]`` where ``totalrange`` comes
    from ``get_total_range(lr)`` and ``outputs`` is the concatenation of
    whatever ``do_fuzzy`` returns for each combined fuzzy gene prediction
    built from ``lr``.

    The short-read input ``srin`` is reduced to a dict keyed by
    ``"<chr>:<left end>,<right end>"``; each value stores how many times
    that key occurred (``"cnt"``) and the first junction object that
    produced it (``"fzjun"``).
    """
    if len(lr) == 0:
        return None
    totalrange = get_total_range(lr)

    support = {}
    for gpd in srin:
        fuzzy_gpd = GenePredFuzzyBasics.FuzzyGenePred(gpd, juntol=args.junction_tolerance)
        for jun in fuzzy_gpd.fuzzy_junctions:
            label = "".join([jun.left.chr, ":", str(jun.left.end), ",", str(jun.right.end)])
            # setdefault keeps the first junction seen for a label.
            support.setdefault(label, {"cnt": 0, "fzjun": jun})["cnt"] += 1

    combined = GenePredFuzzyBasics.greedy_gpd_list_to_combined_fuzzy_list(lr, args.junction_tolerance)
    outputs = [item for fz in combined for item in do_fuzzy(fz, support, args)]
    return [outputs, totalrange]
Ejemplo n.º 3
0
def main():
    """Entry point: stream loci from the input and emit processed results.

    Options come from ``do_inputs()``.  ``args.output`` is published in the
    module-level ``gout``.  Loci are read one buffer at a time from a
    ``GenePredLocusStream`` over ``args.input`` until ``read_locus()``
    returns something falsy.  With ``args.threads > 1`` each buffer is
    submitted to a ``Pool`` (presumably ``multiprocessing.Pool`` -- confirm
    against the file's imports) with ``out_gpds`` as the callback;
    otherwise ``process_buffer`` and ``out_gpds`` are called inline.
    A newline is written to stderr at the end.
    """
    global gout
    args = do_inputs()
    gout = args.output
    locus_stream = GenePredBasics.GenePredLocusStream(args.input)
    # Constructed but not referenced again in this function.
    fgs = GenePredFuzzyBasics.FuzzyGenePredSeparator()

    pool = Pool(processes=args.threads) if args.threads > 1 else None

    while True:
        locus = locus_stream.read_locus()
        if not locus:
            break
        if pool is None:
            out_gpds(process_buffer(locus, args))
        else:
            pool.apply_async(process_buffer,
                             args=(locus, args),
                             callback=out_gpds)

    if pool is not None:
        # Wait for every submitted buffer before finishing.
        pool.close()
        pool.join()
    sys.stderr.write("\n")
def evaluate_junctions(fz, sr, args):
    """Cut a fuzzy gene prediction into pieces backed by short-read junctions.

    Works on ``fz.copy()``.  Each junction's ``'junc'`` payload lists are
    cleared and refilled from the entries of ``sr`` whose junction overlaps
    the corresponding junction of ``fz`` (within
    ``args.junction_tolerance``); an overlapping entry contributes
    ``min(cnt, args.downsample)`` copies of its first left/right ``'junc'``
    value.  A junction holding at least ``args.required_evidence`` values
    counts as supported, and every maximal run of consecutive supported
    junctions becomes one tab-separated genePred-style coordinate string
    (strand is always written as ``+``).

    fz   -- fuzzy gene prediction; ``gpds``, ``fuzzy_junctions`` and
            ``copy()`` are used
    sr   -- dict whose values carry ``'fzjun'`` and ``'cnt'``
    args -- options: ``junction_tolerance``, ``downsample``,
            ``required_evidence``

    Returns a list of ``[part, source_names]`` pairs, ``source_names`` being
    the ``entry['name']`` of every member of ``fz.gpds``.  Returns ``[]``
    when the copy has no junctions.  A part that fails
    ``GenePredEntry.is_valid()`` is reported on stderr and skipped.
    """
    source_names = [gpd.entry['name'] for gpd in fz.gpds]
    working = fz.copy()
    if len(working.fuzzy_junctions) == 0:
        return []

    # Pass 1: rebuild the 'junc' evidence lists from the short-read tally.
    for idx, newjun in enumerate(working.fuzzy_junctions):
        newjun.left.get_payload()['junc'] = []
        newjun.right.get_payload()['junc'] = []
        oldjun = fz.fuzzy_junctions[idx]
        for record in sr.values():
            sjun = record['fzjun']
            if not oldjun.overlaps(sjun, args.junction_tolerance):
                continue
            copies = min(record['cnt'], args.downsample)
            for _ in range(0, copies):
                newjun.left.get_payload()['junc'].append(
                    sjun.left.get_payload()['junc'][0])
                newjun.right.get_payload()['junc'].append(
                    sjun.right.get_payload()['junc'][0])

    # Pass 2: per junction, record evidence and (if supported) its bounds.
    juncs, starts, ends, evidences = [], [], [], []
    last = len(fz.fuzzy_junctions) - 1
    for idx in range(0, last + 1):
        jun = working.fuzzy_junctions[idx]
        evidence = len(jun.left.get_payload()['junc'])
        evidences.append(evidence)
        if not (evidence >= args.required_evidence):
            # Placeholders keep the four lists index-aligned.
            starts.append([])
            ends.append([])
            juncs.append([])
            continue

        if idx == 0:
            start = working.start.start
        elif jun.left.get_payload()['start']:
            start = jun.left.get_payload()['start'].start
        else:
            start = working.fuzzy_junctions[idx - 1].right.start
        starts.append(start)

        if idx == last:
            end = working.end.end
        elif jun.right.get_payload()['end']:
            end = jun.right.get_payload()['end'].end
        else:
            end = working.fuzzy_junctions[idx + 1].left.end
        ends.append(end)

        bestleft = GenePredFuzzyBasics.mode(jun.left.get_payload()['junc'])
        bestright = GenePredFuzzyBasics.mode(jun.right.get_payload()['junc'])
        juncs.append([bestleft, bestright])

    # Pass 3: group consecutive supported junction indices into runs.
    runs = []
    current_run = []
    for idx, evidence in enumerate(evidences):
        if evidence < args.required_evidence:
            if current_run:
                runs.append(current_run)
            current_run = []
        else:
            current_run.append(idx)
    if current_run:
        runs.append(current_run)

    # Pass 4: one coordinate line per run.
    parts = []
    for run in runs:
        # The "- 1" puts starts back to zero index (per the original note).
        exon_starts = [starts[run[0]] - 1]
        exon_ends = []
        for member in run:
            exon_starts.append(juncs[member][1] - 1)
            exon_ends.append(juncs[member][0])
        exon_ends.append(ends[run[-1]])

        fields = [
            str(working.start.chr),
            '+',
            str(exon_starts[0]),
            str(exon_ends[-1]),
            str(exon_starts[0]),
            str(exon_ends[-1]),
            str(len(exon_starts)),
            ','.join([str(x) for x in exon_starts]) + ',',
            ','.join([str(x) for x in exon_ends]) + ',',
        ]
        part = "\t".join(fields)

        # Final quality check, using placeholder name columns.
        gpd = GenePredEntry("test1\ttest1\t" + part)
        if not gpd.is_valid():
            sys.stderr.write("\nWARNING skipping invalid GPD\n" +
                             gpd.get_line() + "\n")
            continue
        parts.append([part, source_names])
    return parts
Ejemplo n.º 5
0
def evaluate_junctions(fz, sr, args):
    """Return genePred-style pieces of ``fz`` supported by short-read junctions.

    Steps, all performed on ``fz.copy()``:

    1. refill every junction's ``"junc"`` payload lists from overlapping
       entries of ``sr`` (see ``_attach_short_read_support``);
    2. for each junction whose left ``"junc"`` list has at least
       ``args.required_evidence`` items, work out its start, end and most
       common left/right junction values;
    3. group consecutive supported junctions into runs;
    4. format each run as a tab-separated coordinate string and keep it if
       ``GenePredEntry`` considers it valid.

    Returns a list of ``[part, source_names]`` pairs (``source_names`` is
    ``entry["name"]`` for each member of ``fz.gpds``), or ``[]`` when the
    copy has no junctions.  Invalid parts are reported on stderr and skipped.
    """
    source_names = [member.entry["name"] for member in fz.gpds]
    working = fz.copy()
    if len(working.fuzzy_junctions) == 0:
        return []
    _attach_short_read_support(working, fz, sr, args)

    juncs = []
    starts = []
    ends = []
    evidences = []
    last = len(fz.fuzzy_junctions) - 1
    for k in range(0, last + 1):
        evidence = len(working.fuzzy_junctions[k].left.get_payload()["junc"])
        if evidence >= args.required_evidence:
            start, end, pair = _supported_junction_bounds(working, k, last)
        else:
            # Unsupported junctions get empty placeholders at their index.
            start, end, pair = [], [], []
        starts.append(start)
        ends.append(end)
        juncs.append(pair)
        evidences.append(evidence)

    supported = [not (count < args.required_evidence) for count in evidences]
    parts = []
    for run in _consecutive_runs(supported):
        part = _gpd_part(str(working.start.chr), run, starts, ends, juncs)
        # Final quality check, using placeholder name columns.
        gpd = GenePredEntry("test1\ttest1\t" + part)
        if gpd.is_valid():
            parts.append([part, source_names])
        else:
            sys.stderr.write("\nWARNING skipping invalid GPD\n" + gpd.get_line() + "\n")
    return parts


def _attach_short_read_support(working, fz, sr, args):
    """Reset the "junc" payload lists on *working* and refill them from *sr*."""
    for position, target in enumerate(working.fuzzy_junctions):
        target.left.get_payload()["junc"] = []
        target.right.get_payload()["junc"] = []
        reference = fz.fuzzy_junctions[position]
        for key in sr:
            candidate = sr[key]["fzjun"]
            if reference.overlaps(candidate, args.junction_tolerance):
                # Each overlapping entry counts at most args.downsample times.
                for _ in range(0, min(sr[key]["cnt"], args.downsample)):
                    target.left.get_payload()["junc"].append(candidate.left.get_payload()["junc"][0])
                    target.right.get_payload()["junc"].append(candidate.right.get_payload()["junc"][0])


def _supported_junction_bounds(working, k, last):
    """Return (start, end, [best_left, best_right]) for supported junction *k*."""
    junctions = working.fuzzy_junctions
    here = junctions[k]

    if k == 0:
        start = working.start.start
    elif here.left.get_payload()["start"]:
        start = here.left.get_payload()["start"].start
    else:
        start = junctions[k - 1].right.start

    if k == last:
        end = working.end.end
    elif here.right.get_payload()["end"]:
        end = here.right.get_payload()["end"].end
    else:
        end = junctions[k + 1].left.end

    best_left = GenePredFuzzyBasics.mode(here.left.get_payload()["junc"])
    best_right = GenePredFuzzyBasics.mode(here.right.get_payload()["junc"])
    return start, end, [best_left, best_right]


def _consecutive_runs(flags):
    """Group the positions of truthy *flags* into lists of consecutive indices."""
    runs = []
    current = []
    for position, is_set in enumerate(flags):
        if is_set:
            current.append(position)
            continue
        if current:
            runs.append(current)
        current = []
    if current:
        runs.append(current)
    return runs


def _gpd_part(chrom, run, starts, ends, juncs):
    """Format one run of junction indices as a tab-separated coordinate string."""
    # The "- 1" puts starts back to zero index (per the original note).
    sarr = [starts[run[0]] - 1] + [juncs[k][1] - 1 for k in run]
    earr = [juncs[k][0] for k in run] + [ends[run[-1]]]
    columns = (
        chrom,
        "+",
        str(sarr[0]),
        str(earr[-1]),
        str(sarr[0]),
        str(earr[-1]),
        str(len(sarr)),
        ",".join([str(x) for x in sarr]) + ",",
        ",".join([str(x) for x in earr]) + ",",
    )
    return "\t".join(columns)
Ejemplo n.º 6
0
def process_buffer(buffer, args):
    """Combine the gene predictions in *buffer* into fuzzy gene predictions.

    Delegates to ``GenePredFuzzyBasics.greedy_gpd_list_to_combined_fuzzy_list``
    with ``args.junction_tolerance`` and returns the two-element list
    ``[fzs, args]`` so the options object travels with the result.
    """
    combine = GenePredFuzzyBasics.greedy_gpd_list_to_combined_fuzzy_list
    return [combine(buffer, args.junction_tolerance), args]
def process_buffer(buffer, args):
  """Build the combined fuzzy list for one locus buffer.

  Calls ``GenePredFuzzyBasics.greedy_gpd_list_to_combined_fuzzy_list`` on
  *buffer* using ``args.junction_tolerance`` and returns ``[fzs, args]``.
  """
  tolerance = args.junction_tolerance
  result = [GenePredFuzzyBasics.greedy_gpd_list_to_combined_fuzzy_list(buffer, tolerance)]
  result.append(args)
  return result