Example #1
0
def f(text):
    """Return up to five sentences of *text* selected by ``TextRank``.

    The text is split with ``sentence.get_sentences``; every sentence is
    tokenized with ``segment.seg`` and passed through ``swfilter.filter``
    (presumably stop-word removal -- confirm against ``swfilter``).  The
    per-sentence token lists are ranked by ``TextRank`` and the sentences
    at the positions reported by ``rank.top_index(limit=5)`` are returned.

    Args:
        text: the raw document to summarize.

    Returns:
        A list of the selected sentences, each stripped of surrounding
        whitespace, in the order produced by ``top_index``.
    """
    sentences = sentence.get_sentences(text)
    # One filtered token list per sentence, in sentence order.
    doc = [swfilter.filter(segment.seg(sent)) for sent in sentences]
    rank = TextRank(doc)
    rank.solve()
    # The indices from top_index refer to positions in `sentences`.
    # The original built this list but never returned it.
    return [sentences[index].strip() for index in rank.top_index(limit=5)]
Example #2
0
 def handle(self, doc):
     """Tokenize *doc* with ``segment.seg`` and return the tokens after ``swfilter.filter``."""
     return swfilter.filter(segment.seg(doc))
Example #3
0
def text_preprocess(text):
    """Normalize, split and tokenize a list of raw sentences.

    Processing steps, in order:

    1. Newlines, tabs and the ideographic full stop are replaced by ``.``.
    2. A fixed sequence of regex substitutions removes ASCII letters and
       bracketed spans, collapses runs of dots, spaces, ellipses and
       commas, and replaces every run of digits with the letter ``n``.
    3. Every sentence longer than 20 characters is cut at a punctuation
       mark; the remainder is appended to the work list and is itself
       examined later.
    4. Each sentence is segmented with ``segment.seg`` when
       ``if_segment()`` is true (otherwise it is iterated character by
       character), reduced to the allowed punctuation tokens plus tokens
       accepted by ``is_chinese``, and wrapped in start/end marker tokens.

    The splitting step reads ``sentence_min_len``, which is not defined in
    this function (presumably a module-level setting -- confirm).

    Args:
        text: list of sentence strings.  The caller's list is not modified.

    Returns:
        A list with one token list per (possibly split) sentence.
    """
    allowed_words = ['.', ',', '?', '!', '\\', '-', 'n', ';', ' ', 'W']
    max_len = 20
    forward_marks = [',', '.', '?']
    backward_marks = [',', '.', '。', ',', '\n']

    # Build a new list instead of writing the normalized strings back into
    # the caller's list, which the original did as a side effect.
    # NOTE(review): the last two replacements are no-ops as written;
    # presumably they once mapped full-width punctuation to ASCII -- confirm.
    text = [
        sentance.replace('\n', '.')
        .replace('\t', '.')
        .replace('。', '.')
        .replace(',', ',')
        .replace('?', '?')
        for sentance in text
    ]

    dots = re.compile(r'\.+')
    # (pattern, replacement) pairs; the order is significant.
    substitutions = (
        (re.compile('[a-zA-Z]+'), ""),    # remove ASCII letters
        (re.compile(r'\[.*?\]'), ""),     # remove [...] spans
        (dots, "."),                      # collapse runs of dots
        (re.compile(r' +'), "."),         # runs of spaces become one dot
        (re.compile(r'…+'), "."),         # runs of ellipses become one dot
        (re.compile(r'\\r'), ""),         # remove a literal backslash + r
        (re.compile('[0-9]+'), "n"),      # every number becomes "n"
        (re.compile(r',+'), ","),         # collapse runs of commas
        (re.compile(r'《.*?》'), ""),      # remove 《...》 spans
        (re.compile(r'。+'), "."),        # runs of ideographic stops
        (re.compile(r',+'), ","),         # collapse runs of commas (again)
        (re.compile(r'【.*?】'), ""),      # remove 【...】 spans
        (dots, "."),                      # earlier steps can create new runs
    )
    for pattern, replacement in substitutions:
        text = [pattern.sub(replacement, line) for line in text]

    # Split over-long sentences.  `text` grows while it is scanned: each
    # remainder is appended and handled when the cursor reaches it.
    text_id = 0
    while text_id < len(text):
        current = text[text_id]
        if len(current) <= max_len:
            text_id += 1
            continue
        last_i = len(current) - max_len - 1
        for i in range(last_i + 1):
            if current[i + max_len] in forward_marks:
                # Cut at the first mark at or after position max_len;
                # the mark itself is dropped.
                text.append(current[i + max_len + 1:])
                text[text_id] = current[:i + max_len]
                break
            if i == last_i:
                # No mark found up to the end: keep the sentence unchanged.
                break
            if i == sentence_min_len:
                # Scanned far enough forward: look backwards through the
                # first max_len characters and cut after the nearest mark.
                for j in range(max_len - 1, -1, -1):
                    if current[j] in backward_marks:
                        text.append(current[j + 1:])
                        text[text_id] = current[:j + 1]
                        break
                else:
                    # No mark at all: truncate and discard the remainder
                    # (the original has the append commented out).
                    text[text_id] = current[:i + max_len + 1]
                break
        # The last iteration always breaks, so every pass advances the cursor.
        text_id += 1

    processed_text = []
    for sentance in text:
        tokens = segment.seg(sentance) if if_segment() else sentance
        kept = [word for word in tokens
                if word in allowed_words or is_chinese(word)]
        # Raw strings give the same value as the original literals without
        # relying on invalid escape sequences.
        processed_text.append([r'\start'] + kept + [r'\end'])
    return processed_text
Example #4
0
    return cm,accu


def accuracy(cm):
    """Return the share of the matrix total that lies on the diagonal.

    Args:
        cm: square matrix supporting ``len``, ``cm[i][i]`` and
            ``cm.sum(0)`` (e.g. a NumPy array).

    Returns:
        Sum of the diagonal entries divided by the sum of all entries,
        as a float.
    """
    diagonal_total = sum(cm[k][k] for k in range(len(cm)))
    # cm.sum(0) gives the column sums; adding them up gives the grand total.
    grand_total = sum(cm.sum(0))
    return float(diagonal_total) / float(grand_total)


if __name__ == "__main__":
    # Run segment.seg over the sensor sequence for every (size, theta)
    # combination and print the accuracy reported by confusionmatrix.
    filename = "/home/chi/PycharmProjects/Seg_AR/data/example"
    segment.SENSORLIST = tools.getSensorList(filename)
    true_labels = annotation.origin_annotation(filename)

    # The third whitespace-separated field of each line, expressed as its
    # position in SENSORLIST.
    with open(filename, 'r') as fr:
        seq_index = [segment.SENSORLIST.index(line.split()[2]) for line in fr]

    sizes = range(10, 150, 5)
    thetas = [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
    for size in sizes:
        for theta in thetas:
            border_1 = segment.seg(seq_index, size, theta)
            vote_labels = annotation.seg_labels(true_labels, border_1)
            cm, accu = confusionmatrix(vote_labels, true_labels)
            # Call form prints the same on Python 2 and is valid on Python 3.
            print(accu)
def select(img):
    """Interactively pick a contour on *img* and one sub-part of it.

    The image is segmented with ``segment.seg``; the result is shown in an
    OpenCV window with ``draw_c`` installed as mouse callback until the
    user presses ``q``.  The non-zero pixels of the global canvas ``C`` are
    then split into the number of parts the user types in, the parts are
    drawn in distinct colours, and a second window (callback
    ``draw_part``) lets the user choose one of them.

    Several module-level globals are (re)assigned here; presumably they are
    the state shared with the ``draw_c`` / ``draw_part`` callbacks, which
    are not defined in this block -- confirm against those functions.

    Args:
        img: input passed unchanged to ``segment.seg``.

    Returns:
        A tuple ``(selected_part, d)``: ``selected_part`` is one of the
        arrays produced by ``np.array_split`` over the contour points and
        ``d`` is the third value returned by ``segment.seg``.  When the
        requested number of parts is below 2 or above 10 the first part is
        returned without opening the second window.

    Raises:
        ValueError: if the typed number of parts is not an integer.
    """
    global ims, para, contour, ps, imggg, C, parts, index, para1, ims1, ps1

    # Segmentation
    im, cl, d = segment.seg(img)

    # Setting values of the global variables
    ps = cl.copy()
    imggg = im
    ims = imggg.copy()
    C = np.zeros(ims.shape, np.uint8)
    contour = None
    para = True

    # Both windows are shown at half the size of the segmented image.
    window_size = (int(im.shape[1] / 2), int(im.shape[0] / 2))

    # Selection of contour
    cv2.namedWindow('image', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('image', window_size)
    cv2.moveWindow('image', 40, 0)
    cv2.setMouseCallback('image', draw_c)

    # `ims` is looked up on every pass on purpose: it is a global and may
    # be rebound while the window is open.
    while True:
        cv2.imshow('image', ims)
        k = cv2.waitKey(1) & 0xFF
        if k == ord('q'):
            break
    cv2.destroyAllWindows()

    # Finding all the points on the contour
    Cp = cv2.cvtColor(C, cv2.COLOR_BGR2GRAY)
    # NOTE(review): if nothing was drawn on C, `point` may be None and the
    # split below will fail -- confirm whether that case needs handling.
    point = cv2.findNonZero(Cp)

    # Taking as input the number of parts to which the contour should be divided
    num_parts = input(
        "In how many parts you want to divide the selected vessel (Please enter an integer <=5):  "
    )
    print("Select the required part and press Q.")

    # np.array_split rejects a non-positive section count, which made the
    # "<= 1" fallback below unreachable for 0 and negative input.  Clamp to
    # one section so those values fall back to the whole contour.
    part_count = max(int(num_parts), 1)
    parts = np.array_split(point, part_count, axis=0)

    colbgr = [(193, 182, 255), (255, 0, 102), (255, 128, 0), (0, 255, 255),
              (10, 200, 10)]
    Cparts = np.zeros(C.shape)

    # The first five parts get distinct colours; any further part is white.
    for i, part in enumerate(parts):
        colour = colbgr[i] if i < len(colbgr) else (255, 255, 255)
        cv2.drawContours(Cparts, part, -1, colour, 5)

    # Global variables
    para1 = True
    ps1 = Cparts
    ims1 = ps1.copy()

    # At max supports 10 parts
    if part_count <= 1 or part_count > 10:
        return (parts[0], d)

    # Selecting the part if number of parts <=10
    cv2.namedWindow('image1', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('image1', window_size)
    cv2.moveWindow('image1', 40, 0)
    cv2.setMouseCallback('image1', draw_part)

    while True:
        cv2.imshow('image1', ims1)
        k = cv2.waitKey(1) & 0xFF
        if k == ord('q'):
            break
    cv2.destroyAllWindows()

    # `index` is a global that this function never assigns; presumably the
    # draw_part callback sets it -- confirm.
    C_parts_selected = parts[index]

    return C_parts_selected, d
Example #6
0
 def handle(self, doc):
     """Tokenize *doc* with ``segment.seg`` and return the tokens as a list (no filtering)."""
     tokens = list(segment.seg(doc))
     return tokens
Example #7
0
#!/usr/bin/env python3

import segment as sg

inputSylla = ""
output = ""

# Example calls:
#   sg.seg("ektygcmqcqlg")
#   sg.seg("wygdgdrndhqa")

# Read one code per line from stdin until a line equal to "#".  For every
# code, print all but the last element of sg.seg's result on one line.
while True:
    inputSylla = input()
    if inputSylla == "#":
        break
    ans = sg.seg(inputSylla)
    print("".join(str(ans[i]) for i in range(len(ans) - 1)))
	


Example #8
0
#print(sg.seg("wygdgdrndhqa"))

# Interactive loop.  Each pass reads one line from the user, either picks an
# entry of `on_screen` (when the line starts with a digit) or appends the line
# to the pending code `inputSylla`, prints sg.seg(inputSylla), and then scans
# sg.wubi for entries whose first field starts with a prefix of the pending code.
# NOTE(review): `on_screen` is not assigned anywhere in this fragment, and the
# fragment ends inside the scanning loop -- presumably the rest of the loop
# body fills `on_screen` and updates `it` / `flag`; confirm in the full file.
while True:
	# A trailing "#" discards the pending input.
	# NOTE(review): inputSylla[-1] raises IndexError when inputSylla is empty
	# (e.g. after the reset below followed by an empty input line).
	if inputSylla[-1] == "#":
		inputSylla = ""
	inp_ch = input("Please input char:")
	if inp_ch != "" and inp_ch[0].isdigit():
		# Numeric input selects a candidate.  Only the first character is
		# checked, but int() parses the whole line.
		print("Choose :" + on_screen[int(inp_ch)][0] + str(on_screen[int(inp_ch)][1]))
		output += on_screen[int(inp_ch)][0]
		# The candidate's second field is used as the number of leading
		# characters to drop from the pending input.
		inputSylla = inputSylla[on_screen[int(inp_ch)][1]:]
	else:	
		# Anything else extends the pending input.
		inputSylla += inp_ch
	print(sg.seg(inputSylla))
	
	#max incorrection pattern
	screen = []
	
	# `it` is the prefix length being matched, starting at 4.
	it = 4
	counter = 0
	flag = True
	while it >= 0 and flag:
		tp_list = []
		for wubi_line in sg.wubi:
			if wubi_line[0].startswith(inputSylla[0:it]):
				wubi_code = int(wubi_line[1])
				# Only codes present in sg.uni_map become candidates ...
				if wubi_code in sg.uni_map:
					tp_list.append([sg.wd_map[wubi_code],it,sg.uni_map[wubi_code]])
				# ... but every prefix match is counted.
				counter += 1
Example #9
0
pg.pg_init()

# Merge the two attributes of each education record: the first two elements
# of every item returned by segment.seg are concatenated.
people = pg.get_edu(0, 0)
# A list comprehension yields a real list on Python 2 and 3 alike; map()
# would be a one-shot iterator on Python 3.
raw = [x[0] + x[1] for x in segment.seg(people)]

topic_model.build_lsi(raw)
Example #10
0
# -*- encoding:"utf-8"-*-
# coding=utf-8

import tools
import segment

filename = "G:/Seg_AR/data/annotated"
segment.SENSORLIST = tools.getSensorList(filename)

# The third whitespace-separated field of each line, expressed as its
# position in SENSORLIST.
with open(filename, "r") as fr:
    seq_index = [segment.SENSORLIST.index(line.split()[2]) for line in fr]
# Call form prints the same on Python 2 and is valid on Python 3.
print(len(seq_index))

borders = segment.seg(seq_index)
print(len(borders))
print(borders)
Example #11
0
from __future__ import absolute_import

import codecs
import segment
import pg

# Dump the segmented education records to a UTF-8 text file: one line per
# item returned by segment.seg, elements separated by tabs and the pieces of
# an element separated by spaces.
# The `with` block guarantees the file is flushed and closed, even if
# fetching or segmenting a batch raises; the original never closed it.
with codecs.open("data/seg_data1.txt", 'wb', encoding='utf-8') as output_file:
    # Kept after the open call to preserve the original statement order.
    pg.pg_init()

    # Passed to pg.get_edu as-is; presumably batch size and batch number --
    # TODO confirm against pg.get_edu.
    bias = 1000
    counter = 0
    while True:
        raw = pg.get_edu(bias, counter)
        # As in the original, the final (empty) batch is still passed to
        # segment.seg before the loop stops.
        for mlist in segment.seg(raw):
            for ele in mlist:
                for x in ele:
                    output_file.write(x + ' ')
                output_file.write('\t')
            output_file.write('\n')
        counter += 1
        if len(raw) == 0:
            break