Beispiel #1
0
def predic_unscore_api(inputtext):
    """Tag every line of *inputtext* with the CRF model and return the labels.

    The text is split with ``util.text_to_lines``; each line is turned into a
    feature sequence and labelled by the pycrfsuite model stored in
    ``datalunyu5001.m``.

    Nothing is scored here. Only the predicted labels are returned, so no
    precision/recall is computed; such a computation divides by zero whenever
    the reference or the prediction contains no ``"S"`` label, and cannot
    unpack anything for empty input.

    Args:
        inputtext: Raw text to tag.

    Returns:
        A list with one entry per input line; each entry is whatever
        ``tagger.tag`` returned for that line's feature sequence. The list is
        empty when ``util.text_to_lines`` yields no lines.
    """
    charstop = True  # True means label attributes to previous char
    features = 3  # 1=discrete; 2=vectors; 3=both
    dictfile = 'vector/24scbow50.txt'
    modelname = 'datalunyu5001.m'
    vdict = util.readvec(dictfile)

    tagger = pycrfsuite.Tagger()
    tagger.open(modelname)

    out = []
    for line in util.text_to_lines(inputtext):
        # The reference labels from line_toseq are not needed without scoring.
        x, _ = util.line_toseq(line, charstop)
        if features == 1:
            xseq = crf.x_seq_to_features_discrete(x, charstop)
        elif features == 2:
            xseq = crf.x_seq_to_features_vector(x, vdict, charstop)
        else:
            xseq = crf.x_seq_to_features_both(x, vdict, charstop)
        out.append(tagger.tag(xseq))

    return out
Beispiel #2
0
def predic():
    """Tag the submitted text and return an HTML report of scores and lines.

    Reads ``input_text`` from ``request.form``, labels every line with the
    pycrfsuite model ``datalunyu5001.m`` and builds one ``<br>``-separated
    string holding the token counts, precision, recall, F1, the number of
    characters, the "block uncertain rate" and finally the rendered lines.

    Any ratio whose denominator is zero (no lines at all, no ``"S"`` in the
    reference or in the output, no characters) is reported as ``0.0`` rather
    than raising ``ZeroDivisionError``.

    Returns:
        The report as a single string.
    """

    def ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 for a zero denominator."""
        return numerator / denominator if denominator else 0.0

    charstop = True  # True means label attributes to previous char
    features = 3  # 1=discrete; 2=vectors; 3=both
    dictfile = 'vector/24scbow50.txt'
    modelname = 'datalunyu5001.m'
    vdict = util.readvec(dictfile)
    inputtext = request.form.get('input_text', '')

    data = []
    for line in util.text_to_lines(inputtext):
        x, y = util.line_toseq(line, charstop)
        if features == 1:
            feats = crf.x_seq_to_features_discrete(x, charstop)
        elif features == 2:
            feats = crf.x_seq_to_features_vector(x, vdict, charstop)
        else:
            feats = crf.x_seq_to_features_both(x, vdict, charstop)
        data.append((feats, y))

    tagger = pycrfsuite.Tagger()
    tagger.open(modelname)

    results = []
    lines = []
    s_probs = []  # marginal probability of label 'S', one entry per character
    n_probs = []  # marginal probability of label 'N', one entry per character
    for xseq, yref in data:
        yout = tagger.tag(xseq)
        for position in range(len(yout)):
            s_probs.append(tagger.marginal('S', position))
            n_probs.append(tagger.marginal('N', position))
        results.append(util.eval(yref, yout, "S"))
        # NOTE(review): s_probs/n_probs hold the probabilities of every line
        # processed so far, not only of this line -- confirm that
        # util.seq_to_line expects the cumulative lists.
        lines.append(
            util.seq_to_line([feat['gs0'] for feat in xseq], yout, charstop,
                             s_probs, n_probs))

    # Per character: take the probability of the more likely label and map
    # it with (p - 0.5) * 10, then sum over all characters.
    u_score = sum((max(s, n) - 0.5) * 10 for s, n in zip(s_probs, n_probs))

    if results:
        tp, fp, fn, tn = (sum(column) for column in zip(*results))
    else:
        # zip(*[]) yields nothing to unpack; treat "no lines" as all-zero.
        tp = fp = fn = tn = 0

    p = ratio(tp, tp + fp)
    r = ratio(tp, tp + fn)
    f1 = ratio(2 * p * r, p + r)

    report = [
        "Total tokens in Test Set:" + repr(tp + fp + fn + tn),
        "Total S in REF:" + repr(tp + fn),
        "Total S in OUT:" + repr(tp + fp),
        "Presicion:" + repr(p),
        "Recall:" + repr(r),
        "*******************F1-score:" + repr(f1),
        "=======================",
        "character count:" + str(len(s_probs)),
        "block uncertain rate:" + str(ratio(u_score, len(s_probs))),
    ]
    score = ''.join('<br>' + item for item in report)

    # NOTE(review): the lines are derived from the submitted form text and
    # are placed into the HTML string unescaped -- confirm util.seq_to_line
    # escapes user characters (it may emit markup itself, so it is not
    # escaped here).
    output = ''.join('<br>' + line for line in lines)

    return score + '<br>' + output
     # NOTE(review): truncated fragment -- the enclosing block header and the
     # setup of testdata, tagger, Spp, Npp, results, lines, all_len, charstop
     # and testidx are not visible here.
     # Tag the last remaining test sequence and record, for every position,
     # the marginal probabilities of the 'S' and 'N' labels.
     xseq, yref = testdata.pop()
     #print(xseq)
     yout = tagger.tag(xseq)
     all_len += len(yout)
     #print(len(xseq),len(yout),all_len)
     sp = 0
     np = 0
     for i in range(len(yout)):
         sp = tagger.marginal('S',i)
         Spp.append(sp) # probability of the 'S' label
         #print(sp)
         np = tagger.marginal('N',i) 
         Npp.append(np)# probability of the 'N' label
         #print(np)
     results.append(util.eval(yref, yout, "S"))
     lines.append(util.seq_to_line([x['gs0'] for x in xseq],yout,charstop,Spp,Npp))
     #print(util.seq_to_line([x['gs0'] for x in xseq], (str(sp) +'/'+ str(np)),charstop))
     
     # For every recorded position: 1 minus the larger of the two label
     # probabilities, collected in score_array.
     score_array = []
     All_u_score = 0
     for i in range(len(Spp)):
         _s = 0
         if Spp[i] > Npp[i]: _s = Spp[i]
         else :_s = Npp[i]
         #_s = (_s - 0.5) * 10
         _s = (1 - _s)
         #U_score = U_score + _s
         score_array.append(_s)
 for i in range(len(testidx)):
     U_score = 0 # uncertainty value of the text block
     text_count = 0 # character count
Beispiel #4
0
    elif features == 3:
        d = crf.x_seq_to_features_both(x, vdict, charstop), y
    data.append(d)

tagger = pycrfsuite.Tagger()
tagger.open(modelname)

print datetime.datetime.now()
print "Start testing..."
results = []
lines = []
while data:
    xseq, yref = data.pop()
    yout = tagger.tag(xseq)
    results.append(util.eval(yref, yout, "S"))
    lines.append(util.seq_to_line([x['gs0'] for x in xseq],yout,charstop))

tp, fp, fn, tn = zip(*results)
tp, fp, fn, tn = sum(tp), sum(fp), sum(fn), sum(tn)

p, r = tp/(tp+fp), tp/(tp+fn)
print "Total tokens in Test Set:", tp+fp+fn+tn
print "Total S in REF:", tp+fn
print "Total S in OUT:", tp+fp
print "Presicion:", p
print "Recall:", r
print "*******************F1-score:", 2*p*r/(p+r)

for line in lines:
    print line.encode('utf8')
Beispiel #5
0
stt = datetime.datetime.now()
print "Start training...", stt
hmmtagger = nt.hmm.HiddenMarkovModelTagger.train(traindata)


print "################# Training took:", datetime.datetime.now()-stt
results = []
lines = []
testdata = [line for line in util.file_to_lines(glob.glob(hu))]
for line in testdata:
    x, yref = util.line_toseq(line, charstop)
    out = hmmtagger.tag(x)
    _, yout = zip(*out)
    yout = list(yout)
    results.append(util.eval(yref, yout, "S"))
    lines.append(util.seq_to_line(x,yout,charstop))
tp, fp, fn, tn = zip(*results)
tp, fp, fn, tn = sum(tp), sum(fp), sum(fn), sum(tn)

p, r = tp/(tp+fp), tp/(tp+fn)
print "Total tokens in Test Set:", tp+fp+fn+tn
print "Total S in REF:", tp+fn
print "Total S in OUT:", tp+fp
print "Presicion:", p
print "Recall:", r
print "F1-score:", 2*p*r/(p+r)

while lines:
    print lines.pop().encode('utf8')
Beispiel #6
0
# Train an HMM tagger on traindata and evaluate it on the files matched by
# the glob pattern hu: print the training time, the precision / recall / F1
# for label "S", then the tagged lines.
stt = datetime.datetime.now()
print (traindata[0])
print ("Start training...", stt)
hmmtagger = nt.hmm.HiddenMarkovModelTagger.train(traindata)
#hmmtagger = nt.HiddenMarkovModelTagger.train(traindata)

print ("################# Training took:", datetime.datetime.now() - stt)
results = []
lines = []
testdata = list(util.file_to_lines(glob.glob(hu)))
for line in testdata:
    x, yref = util.line_toseq(line, charstop)
    out = hmmtagger.tag(x)
    # Keep only the predicted tag of every (token, tag) pair; unlike
    # unpacking zip(*out), this also works when out is empty.
    yout = [tag for _, tag in out]
    results.append(util.eval(yref, yout, "S"))
    lines.append(util.seq_to_line(x, yout, charstop))

if results:
    tp, fp, fn, tn = (sum(column) for column in zip(*results))
else:
    # zip(*[]) yields nothing to unpack; an empty test set counts as zeros.
    tp = fp = fn = tn = 0

# A zero denominator means the ratio is undefined; it is reported as 0.0
# instead of raising ZeroDivisionError.
p = tp / (tp + fp) if tp + fp else 0.0
r = tp / (tp + fn) if tp + fn else 0.0
f1 = 2 * p * r / (p + r) if p + r else 0.0

print ("Total tokens in Test Set:", tp + fp + fn + tn)
print ("Total S in REF:", tp + fn)
print ("Total S in OUT:", tp + fp)
print ("Presicion:", p)
print ("Recall:", r)
print ("F1-score:", f1)

# Lines are popped from the end, so they are printed in reverse order and
# the list is left empty.
# NOTE(review): under Python 3, printing the encoded bytes shows the
# b'...' representation rather than the text -- confirm this is intended.
while lines:
    print (lines.pop().encode('utf8'))