Beispiel #1
0
def dataary(li, gram, features, vdict):
    """Build a single CRF sample from all lines of ``li`` joined together.

    The lines are concatenated into one string, a fixed set of punctuation
    characters is removed, and the result is turned into a character
    sequence and a label sequence by ``util.line_toseq``. The label sequence
    is then shifted one position to the left (its first label is dropped and
    ``'N'`` is appended) before being paired with the extracted features.

    Args:
        li: Iterable of text lines (strings).
        gram: Forwarded unchanged to the discrete feature extractors
            (``features`` 1 and 3).
        features: Feature selector: 1 = discrete, 2 = vector, 3 = both.
        vdict: Vector dictionary forwarded to the vector-based extractors
            (``features`` 2 and 3).

    Returns:
        A one-element list holding a ``(feature_sequence, labels)`` tuple.

    Raises:
        ValueError: If ``features`` is not 1, 2 or 3.
    """
    # Fail fast with a clear message; without this check an unknown selector
    # only surfaced later as an UnboundLocalError on the result variable.
    if features not in (1, 2, 3):
        raise ValueError("features must be 1, 2 or 3, got %r" % (features,))

    # Join once and delete all unwanted punctuation in a single pass.
    strip_punctuation = str.maketrans('', '', ':、!.?》《')
    lineary = ''.join(li).translate(strip_punctuation)

    x, y = util.line_toseq(lineary, charstop)
    # Shift the labels left by one and pad the end with 'N'.
    del y[0]
    y = y + ['N']

    # Build the feature sequence for the text.
    if features == 1:
        feats = crf.x_seq_to_features_discrete(x, charstop, gram)
    elif features == 2:
        feats = crf.x_seq_to_features_vector(x, vdict, charstop)
    else:
        feats = crf.x_seq_to_features_both(x, vdict, charstop, gram)
    return [(feats, y)]
Beispiel #2
0
def dataary(li, gram):
    """Convert each line of ``li`` into a ``(discrete_features, labels)`` pair.

    Every line is split into a character sequence and a label sequence by
    ``util.line_toseq``; the characters are then turned into discrete
    features by ``crf.x_seq_to_features_discrete`` using ``gram``.

    Args:
        li: Iterable of text lines.
        gram: Forwarded unchanged to the discrete feature extractor.

    Returns:
        A list with one ``(feature_sequence, labels)`` tuple per input line,
        in input order.
    """
    def _to_sample(text):
        # One line in, one (features, labels) tuple out.
        chars, labels = util.line_toseq(text, charstop)
        return crf.x_seq_to_features_discrete(chars, charstop, gram), labels

    return [_to_sample(text) for text in li]
def dataary(li, gram, features, vdict):
    """Convert each line of ``li`` into a ``(features, labels)`` pair.

    Every line is split into a character sequence and a label sequence by
    ``util.line_toseq``; the characters are then turned into a feature
    sequence by the extractor selected through ``features``.

    Args:
        li: Iterable of text lines.
        gram: Forwarded unchanged to the discrete feature extractors
            (``features`` 1 and 3).
        features: Feature selector: 1 = discrete, 2 = vector, 3 = both.
        vdict: Vector dictionary forwarded to the vector-based extractors
            (``features`` 2 and 3).

    Returns:
        A list with one ``(feature_sequence, labels)`` tuple per input line,
        in input order.

    Raises:
        ValueError: If ``features`` is not 1, 2 or 3. The selector is
            checked before any line is processed.
    """
    # Fail fast with a clear message; without this check an unknown selector
    # only surfaced as an UnboundLocalError on the first processed line.
    if features not in (1, 2, 3):
        raise ValueError("features must be 1, 2 or 3, got %r" % (features,))

    data = []
    for line in li:
        x, y = util.line_toseq(line, charstop)
        # Build the feature sequence for this line.
        if features == 1:
            feats = crf.x_seq_to_features_discrete(x, charstop, gram)
        elif features == 2:
            feats = crf.x_seq_to_features_vector(x, vdict, charstop)
        else:
            feats = crf.x_seq_to_features_both(x, vdict, charstop, gram)
        data.append((feats, y))
    return data
Beispiel #4
0
def predic_unscore_api(inputtext):
    """Tag ``inputtext`` with the stored CRF model and return the labels.

    The text is split into lines by ``util.text_to_lines``; each line is
    converted to a feature sequence and tagged with the model file
    ``datalunyu5001.m``. The per-position marginal probabilities of the
    ``'S'`` and ``'N'`` labels and the rendered lines are printed as
    diagnostics; only the label sequences are returned.

    Args:
        inputtext: Text to tag.

    Returns:
        A list holding, for every input line, the label sequence returned by
        ``tagger.tag``.

    Raises:
        ValueError: If the local ``features`` setting is not 1, 2 or 3.
    """
    charstop = True  # True means label attributes to previous char
    features = 3  # 1=discrete; 2=vectors; 3=both
    dictfile = 'vector/24scbow50.txt'
    modelname = 'datalunyu5001.m'
    vdict = util.readvec(dictfile)
    li = util.text_to_lines(inputtext)

    print(li)
    data = []
    for line in li:
        # The reference labels are not needed here: no score is reported.
        x, _ = util.line_toseq(line, charstop)
        print(x)
        if features == 1:
            feats = crf.x_seq_to_features_discrete(x, charstop)
        elif features == 2:
            feats = crf.x_seq_to_features_vector(x, vdict, charstop)
        elif features == 3:
            feats = crf.x_seq_to_features_both(x, vdict, charstop)
        else:
            raise ValueError("features must be 1, 2 or 3, got %r" % (features,))
        data.append(feats)

    tagger = pycrfsuite.Tagger()
    tagger.open(modelname)
    print("Start testing...")
    lines = []
    s_probs = []  # marginal probability of the 'S' label, per position
    n_probs = []  # marginal probability of the 'N' label, per position
    out = []
    for done, xseq in enumerate(data):
        print(len(data) - done)  # number of sequences still to tag
        yout = tagger.tag(xseq)
        for pos in range(len(yout)):
            s_prob = tagger.marginal('S', pos)
            s_probs.append(s_prob)
            print(s_prob)
            n_prob = tagger.marginal('N', pos)
            n_probs.append(n_prob)
            print(n_prob)
        # NOTE(review): s_probs/n_probs keep growing across lines, so
        # seq_to_line receives the probabilities of all lines tagged so far,
        # not only those of the current line - confirm this is intended.
        lines.append(
            util.seq_to_line([feat['gs0'] for feat in xseq], yout, charstop,
                             s_probs, n_probs))
        out.append(yout)

    # NOTE(review): an earlier revision also called util.eval() per line and
    # assembled a precision/recall/F1 HTML summary that was never returned.
    # That code raised ValueError on empty input and ZeroDivisionError when a
    # denominator was zero, so it was dropped. Confirm util.eval has no side
    # effect that callers rely on.
    print(lines)
    for line in lines:
        print("output:")
        print(line.encode('utf8'))
        print(line)

    return out
Beispiel #5
0
def predic():
    """Tag the submitted ``input_text`` form field and return an HTML report.

    The text is read from ``request.form``, split into lines by
    ``util.text_to_lines``, converted to feature sequences and tagged with
    the model file ``datalunyu5001.m``. The report contains the evaluation
    counts, precision, recall and F1 for the ``'S'`` label, the number of
    tagged positions, an averaged "block uncertain rate", and finally the
    rendered lines.

    Returns:
        A string in which every report entry and every rendered line is
        preceded by ``<br>``.

    Raises:
        ValueError: If the local ``features`` setting is not 1, 2 or 3.
    """
    charstop = True  # True means label attributes to previous char
    features = 3  # 1=discrete; 2=vectors; 3=both
    dictfile = 'vector/24scbow50.txt'
    modelname = 'datalunyu5001.m'
    vdict = util.readvec(dictfile)
    inputtext = request.form.get('input_text', '')
    li = util.text_to_lines(inputtext)

    print(li)
    data = []
    for line in li:
        x, y = util.line_toseq(line, charstop)
        print(x)
        if features == 1:
            feats = crf.x_seq_to_features_discrete(x, charstop)
        elif features == 2:
            feats = crf.x_seq_to_features_vector(x, vdict, charstop)
        elif features == 3:
            feats = crf.x_seq_to_features_both(x, vdict, charstop)
        else:
            raise ValueError("features must be 1, 2 or 3, got %r" % (features,))
        data.append((feats, y))

    tagger = pycrfsuite.Tagger()
    tagger.open(modelname)
    print("Start testing...")
    results = []
    lines = []
    s_probs = []  # marginal probability of the 'S' label, per position
    n_probs = []  # marginal probability of the 'N' label, per position
    for done, (xseq, yref) in enumerate(data):
        print(len(data) - done)  # number of sequences still to tag
        yout = tagger.tag(xseq)
        for pos in range(len(yout)):
            s_prob = tagger.marginal('S', pos)
            s_probs.append(s_prob)
            print(s_prob)
            n_prob = tagger.marginal('N', pos)
            n_probs.append(n_prob)
            print(n_prob)
        results.append(util.eval(yref, yout, "S"))
        # NOTE(review): s_probs/n_probs keep growing across lines, so
        # seq_to_line receives the probabilities of all lines tagged so far,
        # not only those of the current line - confirm this is intended.
        lines.append(
            util.seq_to_line([feat['gs0'] for feat in xseq], yout, charstop,
                             s_probs, n_probs))

    # Sum, over all positions, how far the more likely label's marginal lies
    # above 0.5 (scaled by 10). Accumulated with explicit additions so the
    # floating-point result does not depend on how sum() adds floats.
    u_score = 0
    for s_prob, n_prob in zip(s_probs, n_probs):
        u_score = u_score + (max(n_prob, s_prob) - 0.5) * 10

    def ratio(numerator, denominator):
        # A zero denominator (no tagged positions, no 'S' labels, ...) used
        # to abort the request with ZeroDivisionError; report 0.0 instead.
        return numerator / denominator if denominator else 0.0

    if results:
        tp, fp, fn, tn = (sum(column) for column in zip(*results))
    else:
        # Nothing was tagged, e.g. the submitted text produced no lines.
        tp = fp = fn = tn = 0

    p = ratio(tp, tp + fp)
    r = ratio(tp, tp + fn)
    report = [
        "Total tokens in Test Set:" + repr(tp + fp + fn + tn),
        "Total S in REF:" + repr(tp + fn),
        "Total S in OUT:" + repr(tp + fp),
        "Presicion:" + repr(p),
        "Recall:" + repr(r),
        "*******************F1-score:" + repr(ratio(2 * p * r, p + r)),
        "=======================",
        "character count:" + str(len(s_probs)),
        "block uncertain rate:" + str(ratio(u_score, len(s_probs))),
    ]
    score = ''.join('<br>' + entry for entry in report)

    # NOTE(review): the rendered lines derive from user-submitted text and
    # are placed into the HTML response unescaped. Confirm whether
    # util.seq_to_line emits markup before adding escaping here.
    output = ''.join('<br>' + line for line in lines)

    return score + '<br>' + output
Beispiel #6
0
def buildCrf(inputtext):
    """Train a CRF model on the files matched by ``inputtext``.

    Lines are read from the matched files with ``util.file_to_lines``,
    shuffled with a fixed random seed and truncated to ``size`` entries.
    The first ``size * trainportion`` samples are used to train a
    pycrfsuite model, which is written to disk together with a text dump.
    The model is then evaluated on its own training samples and the
    precision, recall and F1 for the ``'S'`` label are printed.

    Args:
        inputtext: Glob pattern selecting the training files (passed to
            ``glob.glob``).

    Returns:
        The file name of the trained model.

    Raises:
        ValueError: If the local ``features`` setting is not 1, 2 or 3.
    """
    material = inputtext
    filename = 'model'
    size = 80
    trainportion = 0.9
    dictfile = 'data/vector/24scbow300.txt'
    crfmethod = "l2sgd"  # {‘lbfgs’, ‘l2sgd’, ‘ap’, ‘pa’, ‘arow’}
    charstop = True  # True means label attributes to previous char
    features = 1  # 1=discrete; 2=vectors; 3=both
    random.seed(101)  # NOTE: seeds the module-level random generator

    cut = int(size * trainportion)

    # Name of the model file to train.
    modelname = filename.replace('/', '').replace(
        '*', '') + str(size) + str(charstop) + ".m"
    print(modelname)
    print("Material:", material)
    print("Size:", size, "entries,", trainportion, "as training")

    print(datetime.datetime.now())

    # Prepare li: list of random lines
    vdict = None
    if features > 1:
        vdict = util.readvec(dictfile)  # only the vector features need it
        print("Dict:", dictfile)
    li = list(util.file_to_lines(glob.glob(material)))
    random.shuffle(li)  # random sampling
    li = li[:size]

    # Prepare data: list of x(char), y(label) sequences
    data = []
    for line in li:
        x, y = util.line_toseq(line, charstop)
        if features == 1:
            feats = crf.x_seq_to_features_discrete(x, charstop)
        elif features == 2:
            feats = crf.x_seq_to_features_vector(x, vdict, charstop)
        elif features == 3:
            feats = crf.x_seq_to_features_both(x, vdict, charstop)
        else:
            raise ValueError("features must be 1, 2 or 3, got %r" % (features,))
        data.append((feats, y))

    traindata = data[:cut]

    trainer = pycrfsuite.Trainer()
    for x, y in traindata:
        trainer.append(x, y)

    trainer.select(crfmethod)  # choose the training algorithm
    trainer.set('max_iterations', 10)  # cap the number of training iterations
    trainer.train(modelname)

    # Load the freshly trained model and write a text dump next to it.
    tagger = pycrfsuite.Tagger()
    tagger.open(modelname)
    tagger.dump(modelname + ".txt")

    print(datetime.datetime.now())
    print("Start closed testing...")
    results = []
    print(traindata)  # dumps every training sample to stdout
    # Stay defined (as None) when there is no training sample to tag.
    pr = pp = yout = None
    # Walk the samples from last to first, so the values printed at the end
    # belong to the first training sample.
    for x, yref in reversed(traindata):
        yout = tagger.tag(x)
        pr = tagger.marginal('S', 0)
        pp = tagger.probability(yout)
        results.append(util.eval(yref, yout, "S"))

    def ratio(numerator, denominator):
        # A zero denominator (e.g. the model predicted no 'S' at all) used
        # to abort with ZeroDivisionError; report 0.0 instead.
        return numerator / denominator if denominator else 0.0

    if results:
        tp, fp, fn, tn = (sum(column) for column in zip(*results))
    else:
        tp = fp = fn = tn = 0

    p = ratio(tp, tp + fp)
    r = ratio(tp, tp + fn)
    print("Total tokens in Train Set:", tp + fp + fn + tn)
    print("Total S in REF:", tp + fn)
    print("Total S in OUT:", tp + fp)
    print("Presicion:", p)
    print("Recall:", r)
    print("*******************F1-score:", ratio(2 * p * r, p + r))
    print("*******************:", pr)
    print("*******************:", pp)
    print("*******************:", yout)
    print(datetime.datetime.now())

    return modelname
Beispiel #7
0
# Script fragment: read the training material, turn every line into a
# (features, labels) sample and split the samples into train and test parts.
# `material`, `charstop`, `features`, `vdict` and `cut` are defined outside
# this fragment.
li = [line for line in util.file_to_lines(glob.glob(material))]  # already split into a list
random.shuffle(li)  # shuffle for random sampling
print(len(li))
#li = li[:size]

# Prepare data: list of x(char), y(label) sequences
data = []

for line in li:
    x, y = util.line_toseq(line, charstop)
    #print(x)
    #print(y[:5])

    # Build the feature sequence for this line.
    # NOTE(review): `d` is only assigned when features is 1, 2 or 3.
    if features == 1:
        d = crf.x_seq_to_features_discrete(x, charstop, 1), y
    elif features == 2:
        d = crf.x_seq_to_features_vector(x, vdict, charstop), y
    elif features == 3:
        d = crf.x_seq_to_features_both(x, vdict, charstop, 1), y
    data.append(d)

# Hand-written sample (characters, labels) kept for debugging:
#date = [(['劉','敬','者','齊','人','也','漢','五','年'], ['S', 'S', 'N','S', 'N', 'N','N', 'S', 'N'])]
# The first `cut` samples are used for training, the rest for testing.
traindata = data[:cut]
#traindata = date
testdata = data[cut:]

trainer = pycrfsuite.Trainer()
#print trainer.params()
#print(traindata[0])
for t in traindata:
Beispiel #8
0
# Script fragment (Python 2 print statements): load the material, build one
# (features, labels) sample per line and open a trained model for testing.
# `material`, `features`, `dictfile`, `charstop` and `modelname` are defined
# outside this fragment.
print "Material:", material

print datetime.datetime.now()

# Prepare li: list of random lines
if features > 1:
    # The vector dictionary is only loaded for the vector-based features.
    vdict = util.readvec(dictfile)
    print "Dict:", dictfile
li = [line for line in util.file_to_lines(glob.glob(material))]

# Prepare data: list of x(char), y(label) sequences
data = []
for line in li:
    x, y = util.line_toseq(line, charstop)
    # NOTE(review): `d` is only assigned when features is 1, 2 or 3.
    if features == 1:
        d = crf.x_seq_to_features_discrete(x, charstop), y
    elif features == 2:
        d = crf.x_seq_to_features_vector(x, vdict, charstop), y
    elif features == 3:
        d = crf.x_seq_to_features_both(x, vdict, charstop), y
    data.append(d)

# Open the previously trained model file.
tagger = pycrfsuite.Tagger()
tagger.open(modelname)

print datetime.datetime.now()
print "Start testing..."
results = []
lines = []
while data:
    xseq, yref = data.pop()