def ner(sentence, format=None):
    """
    Locate and classify named entities in text

    Parameters
    ==========
    sentence: {unicode, str}
        raw sentence

    Returns
    =======
    tokens: list of tuple with word, pos tag, chunking tag, ner tag
        tagged sentence

    Examples
    --------
    >>> # -*- coding: utf-8 -*-
    >>> from underthesea import ner
    >>> sentence = "Ông Putin ca ngợi những thành tựu vĩ đại của Liên Xô"
    >>> ner(sentence)
    [('Ông', 'Nc', 'B-NP', 'O'),
    ('Putin', 'Np', 'B-NP', 'B-PER'),
    ('ca ngợi', 'V', 'B-VP', 'O'),
    ('những', 'L', 'B-NP', 'O'),
    ('thành tựu', 'N', 'B-NP', 'O'),
    ('vĩ đại', 'A', 'B-AP', 'O'),
    ('của', 'E', 'B-PP', 'O'),
    ('Liên Xô', 'Np', 'B-NP', 'B-LOC')]
    """
    sentence = chunk(sentence)
    crf_model = CRFNERPredictor.Instance()
    result = crf_model.predict(sentence, format)
    return result
def input_lstm_char(text):
    sentences = nltk.sent_tokenize(text)
    idx_words = [words2idx(s) for s in sentences]
    idx_postags = [
        postag2idx([word[1] for word in underthesea.chunk(s)])
        for s in sentences
    ]
    idx_chars = [chars2idx(s) for s in sentences]
    x_words = pad_sequences(maxlen=max_len, sequences=idx_words,
                            value=dict_words['PAD'],
                            padding='post', truncating='post')
    x_postags = pad_sequences(maxlen=max_len, sequences=idx_postags,
                              value=dict_pos_tags['PAD'],
                              padding='post', truncating='post')
    x_chars = pad_sequences(maxlen=max_len * max_len_char, sequences=idx_chars,
                            value=dict_chars['PAD'],
                            padding='post', truncating='post')
    return x_words, x_chars
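A minimal usage sketch for input_lstm_char, assuming the module-level helpers and vocabularies it references (words2idx, postag2idx, chars2idx, max_len, max_len_char, dict_words, dict_pos_tags, dict_chars) are defined elsewhere in the project; the input text is illustrative only.

# Hypothetical call; all index dictionaries and length limits are assumed to
# be set up by the surrounding training code.
text = "Hà Nội là thủ đô của Việt Nam. Ông Putin ca ngợi những thành tựu vĩ đại của Liên Xô."
x_words, x_chars = input_lstm_char(text)
print(x_words.shape)  # (num_sentences, max_len)
print(x_chars.shape)  # (num_sentences, max_len * max_len_char)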
def chunking(request):
    result = {}
    try:
        text = json.loads(request.body.decode("utf-8"))["text"]
        tags = uts.chunk(text)
        result["output"] = tags
    except Exception:
        result = {"error": "Bad request!"}
    return JsonResponse(result)
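A hedged sketch of calling the view above with Django's test client; the /chunking/ route is an assumption about the project's urlconf, not taken from the source.

# Assumed urlconf entry: path("chunking/", views.chunking) -- hypothetical.
import json
from django.test import Client

client = Client()
response = client.post(
    "/chunking/",
    data=json.dumps({"text": "Hà Nội là thủ đô của Việt Nam"}),
    content_type="application/json",
)
print(response.json())  # {"output": [[word, pos tag, chunk tag], ...]} on success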
def ner(sentence, format=None):
    """
    Named entity recognition

    :param unicode|str sentence: raw sentence
    :return: ner tagged sentence
    :rtype: list
    """
    sentence = chunk(sentence)
    crf_model = CRFNERPredictor.Instance()
    result = crf_model.predict(sentence, format)
    return result
def test_accuracy(self):
    test_dir = join(dirname(__file__), "samples")
    files = listdir(test_dir)
    ids = [f.split(".")[0] for f in files]
    for id in ids:
        file = join(test_dir, "%s.txt" % id)
        sentence = load_input(file)
        actual = chunk(sentence)
        expected = load_output(file)
        if actual != expected:
            print("Fail {}".format(id))
            save_temp(id, actual)
        self.assertEqual(actual, expected)
def get_dict_pos_tag(paragraph):
    words = underthesea.chunk(paragraph)
    pos_tags = [word[1] for word in words]
    pos_tags = set(pos_tags)
    try:
        with open('data/dict/postag.txt', 'r') as f:
            tags = f.read().split('\n')
            pos_tags.update(tags)
    except FileNotFoundError as e:
        print(e)
    with open('data/dict/postag.txt', 'w') as f:
        pos_tags = sorted(pos_tags)
        for t in pos_tags[:-1]:
            f.write(t + '\n')
        f.write(pos_tags[-1])
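A brief usage sketch for get_dict_pos_tag; the example paragraph is illustrative, and the data/dict/ directory is assumed to exist (the function does not create it).

import os

os.makedirs('data/dict', exist_ok=True)  # the function assumes this directory exists
get_dict_pos_tag("Ông Putin ca ngợi những thành tựu vĩ đại của Liên Xô")
with open('data/dict/postag.txt') as f:
    print(f.read().splitlines())  # accumulated, sorted POS tag inventory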
def tokenize(para):
    # split paragraph to sentences
    try:
        sentences = para.split('. ')
    except Exception as e:
        print(e)
    # add '.' back after splitting the paragraph
    for idx, sent in enumerate(sentences[:-1]):
        sentences[idx] = sent + "."
    # chunk each sentence and append a default 'O' tag to every token
    for idx, sent in enumerate(sentences):
        sentences[idx] = underthesea.chunk(sent)
        for i in range(len(sentences[idx])):
            sentences[idx][i] += tuple('O')
    return sentences
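A short usage sketch for tokenize above; the paragraph is an arbitrary illustration.

para = "Hà Nội là thủ đô của Việt Nam. Ông Putin ca ngợi những thành tựu vĩ đại của Liên Xô."
for sent in tokenize(para):
    print(sent)  # each token is a 4-tuple: (word, pos tag, chunking tag, 'O')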
def underthesea_annotate(self, text, mode):
    if mode == 'sent_tokenize':
        return sent_tokenize(text)
    elif mode == 'word_tokenize':
        return word_tokenize(text)
    elif mode == 'pos_tag':
        return pos_tag(text)
    elif mode == 'chunk':
        return chunk(text)
    elif mode == 'ner':
        return ner(text)
    elif mode == 'classify':
        return classify(text)
    elif mode == 'sentiment':
        return sentiment(text)
    else:
        raise Exception("Wrong request, please check your request")
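A hedged usage sketch for the dispatcher above; it assumes the underthesea functions it delegates to are importable as shown, and since self is unused the method can be exercised directly as a plain function.

# Assumed imports for the dispatcher (adjust to the project's actual module layout).
from underthesea import sent_tokenize, word_tokenize, pos_tag, chunk, ner, classify, sentiment

text = "Hà Nội là thủ đô của Việt Nam"
print(underthesea_annotate(None, text, "chunk"))  # self is unused, so None is fine here
print(underthesea_annotate(None, text, "ner"))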
# s3 = "\r\n 7.6\r\n " # print(len(s3)) # s3.strip("\n\r\n") # print(s3) # print(s3[38:len(s3) - 32]) # string.strip(" ") # print(len(string[43:89])) # print(string[43:len(string) - 39]) # print(s2[43:len(s2) - 39]) # print(string.find("ĐN")) # 43:87 from underthesea import ner, chunk, word_tokenize text = "176 Nguyễn Chí Thanh" # print(ner(text)) # print(word_tokenize(text)) rs = chunk(text) print(rs) mon = [] duong = [] checkduong = ['Đường', 'đường'] # print(type(rs)) for i in range(len(rs)): if rs[i][1] == 'Np' and rs[i-1][0] in checkduong: duong.append(rs[i][0]) if rs[i][1] == 'Np' and str(rs[i-1][0]).isnumeric(): soduong = rs[i-1][0] + " " + rs[i][0] duong.append(soduong) elif rs[i][1] == 'Np': mon.append(rs[i][0]) print(mon) print(duong)
def test_simple_cases(self):
    sentence = u""
    actual = chunk(sentence)
    expected = []
    self.assertEqual(actual, expected)
def test_accuracy(self):
    output = chunk(
        u"Tổng Bí thư: Ai trót để tay nhúng chàm thì hãy sớm tự gột rửa")
    self.assertEqual(len(output), 13)
def search(request):
    search = str(request.GET.get("search"))
    rs = chunk(search)
    print(rs)
    mon = []     # dish names
    duong = []   # street names
    quan = []    # district names
    checkduong = ['Đường', 'đường']
    checkquan = ['Quận', 'quận']
    for i in range(len(rs)):
        if rs[i][1] in ['Np', 'N', 'M']:
            if i == 0:
                if len(rs) > 1 and rs[i + 1][1] == 'E':
                    mon.append(rs[i][0])
            else:
                if rs[i - 1][0] in checkquan:
                    quan.append(rs[i][0])
                elif rs[i - 1][0] in checkduong:
                    duong.append(rs[i][0])
                elif str(rs[i - 1][0]).isnumeric():
                    soduong = rs[i - 1][0] + " " + rs[i][0]
                    duong.append(soduong)
                elif rs[i - 1][1] == 'V' or rs[i + 1][1] == 'E':
                    mon.append(rs[i][0])
    print("Quận: ", quan)
    print("Đường: ", duong)
    print("Món: ", mon)

    dt = []
    g = SPARQLWrapper("http://localhost:7200/repositories/HNAG")
    if len(quan) > 0:
        regex = ""
        for i in range(len(mon)):
            regex += "FILTER regex(?name,'" + mon[i] + "','i')."
        for i in range(len(quan)):
            regex += "FILTER regex(?districtName,'" + quan[i] + "','i')."
        queryString = """
            PREFIX res: <http://www.hnag.com/>
            SELECT DISTINCT *
            WHERE {
                ?place res:name ?name.
                ?place res:url ?url.
                ?place res:rating ?rating.
                ?place res:image ?image.
                ?place res:id ?id.
                ?place res:address ?address.
                ?address res:district ?district.
                ?district res:name ?districtName
                """ + regex + """
            }
            ORDER BY DESC(?rating)
        """
        g.setQuery(queryString)
    elif len(duong) > 0:
        regex = ""
        for i in range(len(mon)):
            regex += "FILTER regex(?name,'" + mon[i] + "','i')."
        for i in range(len(duong)):
            regex += "FILTER regex(?street,'" + duong[i] + "','i')."
        queryString = """
            PREFIX res: <http://www.hnag.com/>
            SELECT DISTINCT ?name ?address ?url ?rating ?image ?id
            WHERE {
                ?place res:name ?name.
                ?place res:address ?address.
                ?address res:street ?street.
                ?place res:url ?url.
                ?place res:rating ?rating.
                ?place res:image ?image.
                ?place res:id ?id.
                """ + regex + """
            }
            ORDER BY DESC(?rating)
        """
        g.setQuery(queryString)
    elif len(mon) == 0:
        queryString = """
            PREFIX res: <http://www.hnag.com/>
            SELECT DISTINCT ?name ?address ?url ?rating ?image ?id
            WHERE {
                ?place res:name ?name.
                ?place res:address ?address.
                ?place res:url ?url.
                ?place res:rating ?rating.
                ?place res:image ?image.
                ?place res:id ?id.
                FILTER regex(?name,'""" + search + """','i').
            }
            ORDER BY DESC(?rating)
        """
        g.setQuery(queryString)
    else:
        queryString = """
            PREFIX res: <http://www.hnag.com/>
            SELECT DISTINCT ?name ?address ?url ?rating ?image ?id
            WHERE {
                ?place res:name ?name.
                ?place res:address ?address.
                ?place res:url ?url.
                ?place res:rating ?rating.
                ?place res:image ?image.
                ?place res:id ?id.
                FILTER regex(?name,'""" + mon[0] + """','i').
            }
            ORDER BY DESC(?rating)
        """
        g.setQuery(queryString)

    g.setReturnFormat(JSON)
    results = g.query().convert()
    for row in results["results"]["bindings"]:
        print(row["id"]["value"])
        lJson = {
            "url": row["url"]["value"],
            "name": row["name"]["value"],
            "rate": row["rating"]["value"],
            "image": row["image"]["value"]
        }
        dt.append(lJson)
    return JsonResponse({"posts": dt})
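A hedged client-side sketch for the search endpoint above; the /search/ route and the local Django port are assumptions, and a GraphDB repository named HNAG must be running at localhost:7200 as in the code.

# Hypothetical client call; the /search/ route is an assumption about the urlconf.
import requests

resp = requests.get("http://localhost:8000/search/", params={"search": "phở Quận Hoàn Kiếm"})
print(resp.json()["posts"])  # list of {"url", "name", "rate", "image"} dicts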
from src.eval import whole_word_position
from underthesea import word_sent, pos_tag, chunk
from underthesea.word_sent.model_crf import CRFModel

test = "Đối với các chuyên khoa khác như : Phẩu thuật tổng quát ( nội trú 5 năm ) , sản ( nội trú 5 năm ) , chấn thương chỉnh hình ( nội trú 5 năm ) . Và chuyên sâu mỗi chuyên khoa tầm ( 1 - 3 năm tùy chuyên khoa ) . Nói chung cũng tầm 15 - 16 năm ( cho lầm sàn , chưa kể Ph.D )"

print(word_sent(test))
print(chunk(test))
def ner(sentence, format=None):
    sentence = chunk(sentence)
    crf_model = CRFNERPredictor.Instance()
    result = crf_model.predict(sentence, format)
    return result