def dataary(rowdata):
    """Merge all lines into one punctuation-free string and convert it.

    Args:
        rowdata: iterable of text lines (strings) to be concatenated.

    Returns:
        A one-element list ``[[data_x, data_y]]`` where ``data_x`` and
        ``data_y`` each contain the single x / y sequence that
        ``util.line_toseq`` returns for the merged text.
    """
    # The lines to merge are the ``rowdata`` argument; join them in one pass.
    lineary = "".join(rowdata)

    # Drop the punctuation marks that must not take part in the conversion.
    for mark in (":", "、", "!", ".", "?", "》", "《"):
        lineary = lineary.replace(mark, "")

    # ``lineary`` is a plain string, so it is handed to line_toseq as a whole.
    x, y = util.line_toseq(lineary, charstop)
    return [[[x], [y]]]
Ejemplo n.º 2
0
def dataary(li, gram, features, vdict):
    """Merge all lines into one text and build a single CRF training item.

    Args:
        li: iterable of text lines (strings); they are concatenated.
        gram: gram setting forwarded to the discrete / combined extractors.
        features: feature mode, 1 = discrete, 2 = vectors, 3 = both.
        vdict: vector dictionary forwarded to the vector / combined
            extractors (unused when ``features`` is 1).

    Returns:
        A one-element list holding the ``(x_features, y_labels)`` tuple.

    Raises:
        ValueError: if ``features`` is not 1, 2 or 3.
    """
    # Reject an unsupported mode before doing any work; otherwise no branch
    # below would assign the result and the function would fail obscurely.
    if features not in (1, 2, 3):
        raise ValueError(
            "features must be 1 (discrete), 2 (vectors) or 3 (both), got %r"
            % (features,))

    lineary = "".join(li)
    # Drop the punctuation marks that must not take part in the conversion.
    for mark in (":", "、", "!", ".", "?", "》", "《"):
        lineary = lineary.replace(mark, "")

    x, y = util.line_toseq(lineary, charstop)
    # Shift the labels one position to the left and pad the end with 'N' so
    # the label sequence keeps its length.
    del y[0]
    y = y + ['N']

    # Build the gram-based features for the text.
    if features == 1:
        d = crf.x_seq_to_features_discrete(x, charstop, gram), y
    elif features == 2:
        d = crf.x_seq_to_features_vector(x, vdict, charstop), y
    else:
        d = crf.x_seq_to_features_both(x, vdict, charstop, gram), y
    return [d]
Ejemplo n.º 3
0
def dataary(rowdata):
    """Drain ``rowdata`` and collect the x / y sequences of every line.

    Lines are taken from the end of ``rowdata`` with ``pop()``, so the
    caller's list is empty afterwards and the results appear in reverse
    input order.

    Args:
        rowdata: mutable list of text lines.

    Returns:
        A one-element list ``[[data_x, data_y]]`` with the collected x and y
        sequences.
    """
    xs, ys = [], []
    while rowdata:
        seq_x, seq_y = util.line_toseq(rowdata.pop(), charstop)
        xs.append(seq_x)
        ys.append(seq_y)
    return [[xs, ys]]
Ejemplo n.º 4
0
def dataary(li, gram):
    """Turn every line into a ``(discrete_features, labels)`` tuple.

    Args:
        li: iterable of text lines.
        gram: gram setting forwarded to the discrete feature extractor.

    Returns:
        A list with one ``(x_features, y_labels)`` tuple per line, in input
        order.
    """
    def _to_example(text):
        chars, labels = util.line_toseq(text, charstop)
        # Build the gram-based features for this line.
        return crf.x_seq_to_features_discrete(chars, charstop, gram), labels

    return [_to_example(text) for text in li]
def dataary(li, gram, features, vdict):
    """Turn every line into a ``(features, labels)`` tuple.

    Args:
        li: iterable of text lines.
        gram: gram setting forwarded to the discrete / combined extractors.
        features: feature mode, 1 = discrete, 2 = vectors, 3 = both.
        vdict: vector dictionary forwarded to the vector / combined
            extractors (unused when ``features`` is 1).

    Returns:
        A list with one ``(x_features, y_labels)`` tuple per line, in input
        order.

    Raises:
        ValueError: if ``features`` is not 1, 2 or 3.
    """
    # Reject an unsupported mode up front; otherwise no branch in the loop
    # would assign the per-line result and the function would fail obscurely.
    if features not in (1, 2, 3):
        raise ValueError(
            "features must be 1 (discrete), 2 (vectors) or 3 (both), got %r"
            % (features,))

    data = []
    for line in li:
        x, y = util.line_toseq(line, charstop)
        # Build the gram-based features for this line.
        if features == 1:
            d = crf.x_seq_to_features_discrete(x, charstop, gram), y
        elif features == 2:
            d = crf.x_seq_to_features_vector(x, vdict, charstop), y
        else:
            d = crf.x_seq_to_features_both(x, vdict, charstop, gram), y
        data.append(d)
    return data
Ejemplo n.º 6
0
"python cpr.py 'qualitative/allover-sjw-gold.txt' 'qualitative/allover-sjw-me.txt' 1"
# Script fragment: evaluates one set of labelled lines against another.
args = sys.argv
# Command-line overrides: two glob patterns and the charstop flag.
# NOTE(review): only len(args) > 1 is checked, yet args[2] and args[3] are
# read as well, so fewer than three arguments raises IndexError.
if len(args) > 1:
    material1 = args[1]
    material2 = args[2]
    charstop = int(args[3])

# Read the reference ("gold") lines and the lines to be scored from the files
# matching each glob pattern.
print "Reading from files..."
gold = [line for line in util.file_to_lines(glob.glob(material1))]
out = [line for line in util.file_to_lines(glob.glob(material2))]

# Convert every reference line with util.line_toseq.
golddata = []
for line in gold:
    golddata.append(util.line_toseq(line, charstop))

# Convert every line to be scored the same way.
outdata = []
for line in out:
    outdata.append(util.line_toseq(line, charstop))

# Each golddata / outdata entry is a line_toseq result; element [1] is read
# below as the label sequence (presumably entries are (x_seq, y_seq) pairs).

results = []
# Both inputs must have the same number of lines to be compared pairwise.
assert len(golddata) == len(outdata)
# Score each pair of label sequences for the "S" label.
# NOTE: the body of the except handler lies beyond this excerpt.
for i in range(len(golddata)):
    try:
        yref = golddata[i][1]
        yout = outdata[i][1]
        results.append(util.eval(yref, yout, "S"))
    except AssertionError:
Ejemplo n.º 7
0
# Script fragment (Python 2 print statements): shuffle the lines, keep the
# first `size` of them and encode the training / validation splits.
random.shuffle(li)
li = li[:size]

print "Preparing dictionaries..."
# Exactly one lookup structure is built, depending on the `dense` flag.
if dense: vdict = util.lstmvec(dictfile)
else: charset = util.make_charset(li,7)

print "Preparing datasets..."

# Split by position: [0, cut1) train, [cut1, cut2) validate, [cut2, end) test.
# NOTE: dataset_test is only sliced here; it is not encoded in this excerpt.
dataset_train = li[:cut1]
dataset_validate = li[cut1:cut2]
dataset_test = li[cut2:]

# Encode the training lines. pop() consumes the slice from its end, so the
# encoded items are in reverse order of the slice.
dataset = []
while dataset_train:
    x, y = util.line_toseq(dataset_train.pop(), charstop)
    if dense: dataset.append(util.seq_to_densevec(x, y, vdict))
    else: dataset.append(util.seq_to_sparsevec(x,y,charset))
    # Progress message whenever the number of remaining lines is a multiple of 1000.
    if not len(dataset_train)%1000: print "len(dataset_train)", len(dataset_train)
dataset_train = dataset

# Encode the validation lines the same way.
dataset = []
while dataset_validate:
    x, y = util.line_toseq(dataset_validate.pop(), charstop)
    if dense: dataset.append(util.seq_to_densevec(x, y, vdict))
    else: dataset.append(util.seq_to_sparsevec(x,y,charset))
    # Progress message whenever the number of remaining lines is a multiple of 1000.
    if not len(dataset_validate)%1000: print "len(dataset_validate)", len(dataset_validate)
dataset_validate = dataset


#sys.exit()
Ejemplo n.º 8
0
    # Fragment from inside a function whose header and end are outside this
    # excerpt: trains an HMM tagger on the blocks selected by `trainidx`.
    print('train:', trainidx)
    print('test:', testidx)

    traindataary = []
    testdata = []
    # Flatten all training blocks into one array of lines.
    for i in trainidx:
        traindataary = numpy.hstack((traindataary, rowdata[i]))
    testdataary = []
    # Test blocks are kept separate: one testdata entry per test index.
    for i in testidx:
        testdataary = rowdata[i]
        testdata.append(testdataary)
    #print(testdata)
    traindata = []
    # Data processing: pair each x element with its y label for the trainer.
    # NOTE(review): under Python 3 zip() returns a one-shot iterator rather
    # than a list - confirm the trainer accepts that.
    for line in traindataary:
        x, y = util.line_toseq(line, charstop)
        traindata.append(zip(x, y))

    trainer = hmm.HiddenMarkovModelTrainer()
    tagger = trainer.train_supervised(traindata)
    # Build the trained model file

    # Start testing
    print(datetime.datetime.now())
    print("Start closed testing...")
    results = []
    f.write(str(log_text))

    # Loop over the test blocks (the rest of the loop body is beyond this excerpt).
    for j in range(len(testdata)):
        # 1-based number of the j-th block
        blocktext = j + 1
# Keep only the first `size` lines.
li = li[:size]

print("Preparing dictionaries...")
# Exactly one lookup structure is built, depending on the `dense` flag.
if dense:
    vdict = util.lstmvec(dictfile)
else:
    charset = util.make_charset(li, 7)

print("Preparing datasets...")
# Split by position: [0, cut1) train, [cut1, cut2) validate, [cut2, end) test.
dataset_train = li[:cut1]
dataset_validate = li[cut1:cut2]
dataset_test = li[cut2:]

dataset = []
print(type(dataset_train))

# Encode the training lines; pop() consumes the slice from its end.
while dataset_train:
    x, y = util.line_toseq(dataset_train.pop(), charstop)
    dataset.append(
        util.seq_to_densevec(x, y, vdict) if dense
        else util.seq_to_sparsevec(x, y, charset))
    # Progress message whenever the remaining count is a multiple of 1000.
    if len(dataset_train) % 1000 == 0:
        print("len(dataset_train)", len(dataset_train))
dataset_train = dataset

# Encode the validation lines the same way.
dataset = []
while dataset_validate:
    x, y = util.line_toseq(dataset_validate.pop(), charstop)
    dataset.append(
        util.seq_to_densevec(x, y, vdict) if dense
        else util.seq_to_sparsevec(x, y, charset))
    if len(dataset_validate) % 1000 == 0:
        print("len(dataset_validate)", len(dataset_validate))
dataset_validate = dataset
Ejemplo n.º 10
0
def predic_unscore_api(inputtext):
    """Tag ``inputtext`` with the stored CRF model and return the labels.

    The text is split into lines, every line is converted to features and
    tagged, and the per-position marginals of the 'S' and 'N' labels are
    printed along the way. No score is computed: nothing but the predicted
    label sequences is returned.

    Args:
        inputtext: raw text; split into lines by ``util.text_to_lines``.

    Returns:
        A list with one entry per input line: the label sequence returned by
        ``tagger.tag`` for that line. Empty when the text yields no lines.
    """
    charstop = True  # True means label attributes to previous char
    features = 3  # 1=discrete; 2=vectors; 3=both
    dictfile = 'vector/24scbow50.txt'
    modelname = 'datalunyu5001.m'
    vdict = util.readvec(dictfile)
    li = util.text_to_lines(inputtext)

    print(li)
    data = []
    for line in li:
        x, y = util.line_toseq(line, charstop)
        print(x)
        if features == 1:
            d = crf.x_seq_to_features_discrete(x, charstop), y
        elif features == 2:
            d = crf.x_seq_to_features_vector(x, vdict, charstop), y
        elif features == 3:
            d = crf.x_seq_to_features_both(x, vdict, charstop), y
        data.append(d)

    tagger = pycrfsuite.Tagger()
    tagger.open(modelname)
    print("Start testing...")
    lines = []
    Spp = []  # marginal probability of label 'S', accumulated over all lines
    Npp = []  # marginal probability of label 'N', accumulated over all lines
    out = []
    try:
        for done, (xseq, _yref) in enumerate(data):
            # Number of lines still to be tagged, this one included.
            print(len(data) - done)
            yout = tagger.tag(xseq)
            # Marginals refer to the sequence tagged last, so they are read
            # right after tag().
            for i in range(len(yout)):
                s_prob = tagger.marginal('S', i)
                Spp.append(s_prob)
                print(s_prob)
                n_prob = tagger.marginal('N', i)
                Npp.append(n_prob)
                print(n_prob)
            lines.append(
                util.seq_to_line([feat['gs0'] for feat in xseq], yout,
                                 charstop, Spp, Npp))
            out.append(yout)
    finally:
        # Release the model even when tagging fails.
        tagger.close()

    # Precision / recall / F1 are deliberately not computed here: they were
    # never part of the result, and dividing by the number of predicted or
    # reference 'S' labels fails whenever that number is zero.
    print(lines)
    for line in lines:
        print("output:")
        print(line.encode('utf8'))
        print(line)

    return out
Ejemplo n.º 11
0
def predic():
    """Tag the submitted text and return an HTML report.

    Reads ``input_text`` from ``request.form``, tags every line with the
    stored CRF model and builds a ``<br>``-separated string that contains the
    evaluation figures, the number of characters, the average certainty value
    and the rendered lines.

    Returns:
        The report as a single string. When nothing could be tagged, or when
        a metric's denominator is zero, the affected figures are reported as
        0 instead of raising.
    """
    charstop = True  # True means label attributes to previous char
    features = 3  # 1=discrete; 2=vectors; 3=both
    dictfile = 'vector/24scbow50.txt'
    modelname = 'datalunyu5001.m'
    vdict = util.readvec(dictfile)
    inputtext = request.form.get('input_text', '')
    li = util.text_to_lines(inputtext)

    print(li)
    data = []
    for line in li:
        x, y = util.line_toseq(line, charstop)
        print(x)
        if features == 1:
            d = crf.x_seq_to_features_discrete(x, charstop), y
        elif features == 2:
            d = crf.x_seq_to_features_vector(x, vdict, charstop), y
        elif features == 3:
            d = crf.x_seq_to_features_both(x, vdict, charstop), y
        data.append(d)

    tagger = pycrfsuite.Tagger()
    tagger.open(modelname)
    print("Start testing...")
    results = []
    lines = []
    Spp = []  # marginal probability of label 'S', accumulated over all lines
    Npp = []  # marginal probability of label 'N', accumulated over all lines
    try:
        for done, (xseq, yref) in enumerate(data):
            # Number of lines still to be tagged, this one included.
            print(len(data) - done)
            yout = tagger.tag(xseq)
            # Marginals refer to the sequence tagged last, so they are read
            # right after tag().
            for i in range(len(yout)):
                s_prob = tagger.marginal('S', i)
                Spp.append(s_prob)
                print(s_prob)
                n_prob = tagger.marginal('N', i)
                Npp.append(n_prob)
                print(n_prob)
            results.append(util.eval(yref, yout, "S"))
            lines.append(
                util.seq_to_line([feat['gs0'] for feat in xseq], yout,
                                 charstop, Spp, Npp))
    finally:
        # Release the model even when tagging fails.
        tagger.close()

    # Per character: take the larger of the two marginals and rescale its
    # distance from 0.5 by a factor of 10; the values are summed here and
    # averaged below.
    U_score = 0
    for s_prob, n_prob in zip(Spp, Npp):
        stronger = s_prob if s_prob > n_prob else n_prob
        U_score = U_score + (stronger - 0.5) * 10

    # zip(*results) cannot be unpacked when there are no results at all.
    if results:
        tp, fp, fn, tn = (sum(column) for column in zip(*results))
    else:
        tp = fp = fn = tn = 0

    # Guard every ratio: no predicted 'S' (or no reference 'S') must not
    # abort the request with ZeroDivisionError.
    p = tp / (tp + fp) if (tp + fp) else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
    uncertain_rate = U_score / len(Spp) if Spp else 0.0

    score = ''
    score = score + '<br>' + "Total tokens in Test Set:" + repr(tp + fp + fn +
                                                                tn)
    score = score + '<br>' + "Total S in REF:" + repr(tp + fn)
    score = score + '<br>' + "Total S in OUT:" + repr(tp + fp)
    score = score + '<br>' + "Presicion:" + repr(p)
    score = score + '<br>' + "Recall:" + repr(r)
    score = score + '<br>' + "*******************F1-score:" + repr(f1)
    score = score + '<br>' + "======================="
    score = score + '<br>' + "character count:" + str(len(Spp))
    score = score + '<br>' + "block uncertain rate:" + str(uncertain_rate)

    # NOTE(review): the rendered lines derive from user-supplied text and are
    # concatenated into HTML unescaped. Whether util.seq_to_line emits markup
    # of its own is not visible here, so escaping is left for review.
    output = ''.join('<br>' + line for line in lines)
    output = score + '<br>' + output

    return (output)
Ejemplo n.º 12
0
def buildCrf(inputtext):
    """Train a CRF model on the files matching ``inputtext``.

    Up to ``size`` randomly chosen lines are converted to feature sequences;
    the first ``trainportion`` share is used for training. The model is
    written to ``modelname``, dumped as text next to it, and then evaluated
    on its own training data (closed test) with the figures printed.

    Args:
        inputtext: glob pattern of the material files to read.

    Returns:
        The file name of the trained model.
    """
    material = inputtext
    filename = 'model'
    size = 80
    trainportion = 0.9
    dictfile = 'data/vector/24scbow300.txt'
    crfmethod = "l2sgd"  # {‘lbfgs’, ‘l2sgd’, ‘ap’, ‘pa’, ‘arow’}
    charstop = True  # True means label attributes to previous char
    features = 1  # 1=discrete; 2=vectors; 3=both
    random.seed(101)

    # Command-line form of the same job, kept for reference:
    #   python runcrf.py 'data/sjw/*' 80 data/vector/vectors300.txt 1 1
    cut = int(size * trainportion)

    # Name of the model file
    modelname = filename.replace('/', '').replace(
        '*', '') + str(size) + str(charstop) + ".m"
    print(modelname)
    print("Material:", material)
    print("Size:", size, "entries,", trainportion, "as training")

    print(datetime.datetime.now())

    # Prepare li: list of random lines
    if features > 1:
        vdict = util.readvec(dictfile)  # load the vector dictionary first
        print("Dict:", dictfile)
    li = [line for line in util.file_to_lines(glob.glob(material))]
    random.shuffle(li)  # random sampling
    li = li[:size]

    # Prepare data: list of x(char), y(label) sequences
    data = []
    for line in li:
        x, y = util.line_toseq(line, charstop)
        # Build the features for this line.
        if features == 1:
            d = crf.x_seq_to_features_discrete(x, charstop), y
        elif features == 2:
            d = crf.x_seq_to_features_vector(x, vdict, charstop), y
        elif features == 3:
            d = crf.x_seq_to_features_both(x, vdict, charstop), y
        data.append(d)

    traindata = data[:cut]
    testdata = data[cut:]

    trainer = pycrfsuite.Trainer()
    for x, y in traindata:
        trainer.append(x, y)

    trainer.select(crfmethod)  # choose the training algorithm
    trainer.set('max_iterations', 10)  # cap on training iterations
    trainer.train(modelname)

    tagger = pycrfsuite.Tagger()
    # Load the freshly written model file and dump it as text.
    tagger.open(modelname)
    results = []
    # Values of the sequence tagged last; they stay None when there is no
    # training data, so the summary below can still be printed.
    pr = pp = yout = None
    try:
        tagger.dump(modelname + ".txt")

        print(datetime.datetime.now())
        print("Start closed testing...")
        print(traindata)
        while traindata:
            x, yref = traindata.pop()
            yout = tagger.tag(x)
            pr = tagger.marginal('S', 0)
            pp = tagger.probability(yout)
            results.append(util.eval(yref, yout, "S"))
    finally:
        # Release the model even when tagging fails.
        tagger.close()

    # zip(*results) cannot be unpacked when there are no results at all.
    if results:
        tp, fp, fn, tn = (sum(column) for column in zip(*results))
    else:
        tp = fp = fn = tn = 0

    # Guard every ratio against a zero denominator.
    p = tp / (tp + fp) if (tp + fp) else 0.0
    r = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
    print("Total tokens in Train Set:", tp + fp + fn + tn)
    print("Total S in REF:", tp + fn)
    print("Total S in OUT:", tp + fp)
    print("Presicion:", p)
    print("Recall:", r)
    print("*******************F1-score:", f1)
    print("*******************:", pr)
    print("*******************:", pp)
    print("*******************:", yout)
    print(datetime.datetime.now())

    return (modelname)
Ejemplo n.º 13
0
# Script fragment (Python 2 print statements): trains an HMM tagger on a
# random sample of lines and starts tagging the held-out lines.
# Prepare li: all lines of the matching files, shuffled and cut to `size`.
print "Reading from files..."
li = [line for line in util.file_to_lines(glob.glob(material))]
random.shuffle(li)
li = li[:size]

# Prepare data: list of x(char), y(label) sequences
print "Prepare list of sequences..."

# First `cut` lines are used for training, the rest for testing.
closetestdata = li[:cut]
testdata = li[cut:]

# Pair each x element with its y label, one list of pairs per line.
traindata = []
for line in closetestdata:
    x, y = util.line_toseq(line, charstop)
    traindata.append(zip(x,y))

# traindata shape: [[(x,y),(x,y), ...],[],[],...]
# testdata: still raw lines at this point; each one is converted to
# (x, yref) inside the test loop below.

stt = datetime.datetime.now()
print "Start training...", stt
hmmtagger = nt.hmm.HiddenMarkovModelTagger.train(traindata)


print "################# Training took:", datetime.datetime.now()-stt
results = []
# Tag every held-out line (the rest of the loop body is beyond this excerpt).
for line in testdata:
    x, yref = util.line_toseq(line, charstop)
    out = hmmtagger.tag(x)
Ejemplo n.º 14
0
"python cpr.py 'qualitative/allover-sjw-gold.txt' 'qualitative/allover-sjw-me.txt' 1"
# Script fragment: evaluates one set of labelled lines against another.
args = sys.argv
# Command-line overrides: two glob patterns and the charstop flag.
# NOTE(review): only len(args)>1 is checked, yet args[2] and args[3] are
# read as well, so fewer than three arguments raises IndexError.
if len(args)>1:
    material1 = args[1]
    material2 = args[2]
    charstop = int(args[3])

# Read the reference ("gold") lines and the lines to be scored from the files
# matching each glob pattern.
print "Reading from files..."
gold = [line for line in util.file_to_lines(glob.glob(material1))]
out = [line for line in util.file_to_lines(glob.glob(material2))]


# Convert every reference line with util.line_toseq.
golddata = []
for line in gold:
    golddata.append(util.line_toseq(line, charstop))

# Convert every line to be scored the same way.
outdata = []
for line in out:
    outdata.append(util.line_toseq(line, charstop))

# Each golddata / outdata entry is a line_toseq result; element [1] is read
# below as the label sequence (presumably entries are (x_seq, y_seq) pairs).

results = []
# Both inputs must have the same number of lines to be compared pairwise.
assert len(golddata)==len(outdata)
# Score each pair of label sequences for the "S" label.
# NOTE: the body of the except handler lies beyond this excerpt.
for i in range(len(golddata)):
    try:
        yref = golddata[i][1]
        yout = outdata[i][1]
        results.append(util.eval(yref, yout, "S"))
    except AssertionError: