def word_depparser(sentence, is_segmented=False):
    if not depparser_agent:
        return []
    ## post request
    m_input = nlpc.parse_prep_input()
    m_input.sentence = str(utf8_gbk(sentence))
    m_input.grain_size = 1
    m_input.sentence_segmented = is_segmented
    input_data = sofa.serialize(m_input)
    output_data = ''  # guard against all retries failing below
    for i in range(5):  # retry up to 5 times on RPC failure
        try:
            ret, output_data = depparser_agent.call_method(input_data)
            break
        except Exception as e:
            continue
    if len(output_data) == 0:
        sys.stderr.write('No result: ' + sentence + '\n')
        return []
    ## get results
    m_output = nlpc.depparser_output()
    m_output = sofa.deserialize(output_data, type(m_output))
    tokens = m_output.items
    depparser_list = []
    for i in range(len(tokens)):
        if len(tokens[i].deprel.strip()) == 0:
            tokens[i].deprel = '_'  # empty relation becomes a placeholder
        word = gbk_utf8(tokens[i].word)
        depparser_list.append((word, tokens[i].deprel))
    return depparser_list
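# utf8_gbk and gbk_utf8 are used throughout this file but not defined here.
# A minimal sketch of what they presumably do, assuming the NLPC services
# speak GBK while callers pass UTF-8 byte strings (these bodies are
# assumptions, not the original implementations):
def utf8_gbk(text):
    # assumed helper: re-encode a UTF-8 byte string as GBK
    return text.decode('utf-8').encode('gbk')

def gbk_utf8(text):
    # assumed helper: re-encode a GBK byte string as UTF-8
    return text.decode('gbk').encode('utf-8')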
def word_ner(sentence):
    if not wordner_agent:
        return []
    language_id = 0
    output_id = 1
    m_input = nlpc.wordner_input()
    m_input.lang_id = int(1)
    m_input.query = str(utf8_gbk(sentence))
    input_data = sofa.serialize(m_input)
    output_data = ''  # guard against all retries failing below
    for i in range(5):  # retry up to 5 times on RPC failure
        try:
            ret, output_data = wordner_agent.call_method(input_data)
            break
        except Exception as e:
            continue
    if len(output_data) == 0:
        sys.stderr.write('The server returned no result.' + '\n')
        return []
    m_output = nlpc.wordner_output()
    m_output = sofa.deserialize(output_data, type(m_output))
    tags = m_output.tags
    word_ner_list = []
    for i in range(len(tags)):
        # fall back to 'NOR' for types missing from the mapping; a plain
        # lookup would raise KeyError, and the commented-out debug code
        # below shows 'NOR' is the intended default
        short_type = trans_id_short.get(tags[i].type, 'NOR')
        word_ner_list.append((gbk_utf8(tags[i].term), str(tags[i].type), short_type))
    '''
    debug output, kept for reference:
    sys.stderr.write(gbk_utf8(tags[i].term) + ' ')
    if trans_id_short.has_key(tags[i].type):
        sys.stderr.write(trans_id_short[tags[i].type] + '\t')
    else:
        sys.stderr.write('NOR' + '\t')
    sys.stderr.write('\n')
    '''
    return word_ner_list
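# Hedged usage sketch for word_ner: each result is a (term, numeric type,
# short label) triple. The sample sentence is an illustrative assumption:
#
#   for term, type_id, short_tag in word_ner(u'百度大厦在哪里'.encode('utf-8')):
#       print term, type_id, short_tag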
def word_rank(sentence):
    if not wordrank_agent:
        return []
    ## post request
    m_input = nlpc.wordseg_input()
    m_input.lang_id = int(0)
    m_input.lang_para = int(0)
    m_input.query = str(utf8_gbk(sentence))
    input_data = sofa.serialize(m_input)
    output_data = ''  # guard against all retries failing below
    for i in range(5):  # retry up to 5 times on RPC failure
        try:
            ret, output_data = wordrank_agent.call_method(input_data)
            break
        except Exception as e:
            continue
    if len(output_data) == 0:
        sys.stderr.write('No result: ' + sentence + '\n')
        return []
    ## get results
    m_output = nlpc.wordrank_output()
    m_output = sofa.deserialize(output_data, type(m_output))
    rank_result_list = []
    list_size = len(m_output.nlpc_trunks_pn)
    for i in range(list_size):
        word = gbk_utf8(m_output.nlpc_trunks_pn[i].buffer)
        rank = m_output.nlpc_trunks_pn[i].rank
        wght = round(m_output.nlpc_trunks_pn[i].weight, 3)
        rank_result_list.append((word, rank, wght))
    return rank_result_list
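# Hedged usage sketch for word_rank: results are (word, rank, weight)
# triples, so higher-weight trunks can be filtered out like this (the 0.1
# threshold and the `query` variable are illustrative assumptions):
#
#   important = [w for w, r, wt in word_rank(query) if wt > 0.1]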
def word_seg(sent):
    m_input = nlpc.wordseg_input()
    m_input.query = str(utf8_gbk(sent))
    m_input.lang_id = int(0)
    m_input.lang_para = int(0)
    input_data = sofa.serialize(m_input)
    output_data = ''  # guard against all retries failing below
    for i in range(5):  # retry up to 5 times on RPC failure
        try:
            ret, output_data = wordseg_agent.call_method(input_data)
            break
        except Exception as e:
            continue
    if len(output_data) == 0:
        return []
    m_output = nlpc.wordseg_output()
    m_output = sofa.deserialize(output_data, type(m_output))
    m_output = m_output.scw_out
    ret_data = []
    ## word-phrase segmentation result
    for i in range(m_output.wpbtermcount):
        posidx = GET_TERM_POS(m_output.wpbtermpos[i])
        poslen = GET_TERM_LEN(m_output.wpbtermpos[i])
        word = m_output.wpcompbuf[posidx:posidx + poslen]
        ret_data.append((str(posidx), str(poslen), gbk_utf8(word)))
    '''
    basic segmentation result, currently unused:
    for i in range(m_output.wsbtermcount):
        posidx = GET_TERM_POS(m_output.wsbtermpos[i])
        poslen = GET_TERM_LEN(m_output.wsbtermpos[i])
        word = m_output.wordsepbuf[posidx:posidx + poslen]
        ret_data.append((posidx, poslen, word))
    '''
    return ret_data
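# GET_TERM_POS / GET_TERM_LEN are used above but not defined in this file.
# A minimal sketch, assuming the common packing where the low 24 bits of a
# term-position word hold the byte offset and the high 8 bits hold the byte
# length (an assumption about this service's encoding, not confirmed here):
def GET_TERM_POS(term_pos):
    # assumed: byte offset of the term within the result buffer
    return term_pos & 0x00FFFFFF

def GET_TERM_LEN(term_pos):
    # assumed: byte length of the term
    return term_pos >> 24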
def word_pos(sentence):
    if not wordpos_agent:
        return []
    ## post request
    m_input = nlpc.wordseg_input()
    m_input.lang_id = int(0)
    m_input.lang_para = int(0)
    m_input.query = str(utf8_gbk(sentence))
    input_data = sofa.serialize(m_input)
    output_data = ''  # guard against all retries failing below
    for i in range(5):  # retry up to 5 times on RPC failure
        try:
            ret, output_data = wordpos_agent.call_method(input_data)
            break
        except Exception as e:
            continue
    if len(output_data) == 0:
        sys.stderr.write('No result: ' + sentence + '\n')
        return []
    ## get results
    m_output = nlpc.wordpos_output()
    m_output = sofa.deserialize(output_data, type(m_output))
    tokens_size = len(m_output.nlpc_tokens)
    segment_result = []
    for i in range(tokens_size):
        stag = get_pos_str(m_output.nlpc_tokens[i].type)
        if stag:  # skip tokens whose type has no POS string
            word = gbk_utf8(m_output.nlpc_tokens[i].buffer)
            segment_result.append((word, stag))
    return segment_result
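# Hedged usage sketch for word_pos: results are (word, pos_tag) pairs, so a
# tagged sentence can be rendered in one line (get_pos_str, defined
# elsewhere, is assumed to map numeric token types to tag strings):
#
#   print ' '.join('%s/%s' % (w, t) for w, t in word_pos(sentence))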
def word_lmscore(sentence):
    if not lmscore_agent:
        return 0.0
    m_input = nlpc.lmscore_input()
    m_input.query = str(utf8_gbk(sentence))
    m_input.debug_flag = True
    input_data = sofa.serialize(m_input)
    output_data = ''  # guard against all retries failing below
    for i in range(5):  # retry up to 5 times on RPC failure
        try:
            ret, output_data = lmscore_agent.call_method(input_data)
            break
        except Exception as e:
            continue
    if len(output_data) == 0:
        return 0.0
    m_output = nlpc.lmscore_output()
    m_output = sofa.deserialize(output_data, type(m_output))
    return m_output.result.prob
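# Hedged usage sketch for word_lmscore: the language-model score can rank
# candidate sentences (whether prob is a raw or log probability is not
# stated in the source, so the comparison direction is an assumption):
#
#   best = max(candidates, key=word_lmscore)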
def _sim(self, q1, q2):
    m_input = textsim.textsim_input()
    m_input.query1 = q1
    m_input.query2 = q2
    input_data = sofa.serialize(m_input)
    output_data = ''  # guard against all retries failing below
    for i in range(5):  # retry up to 5 times on RPC failure
        try:
            ret, output_data = self.textsim_agent.call_method(input_data)
            break
        except Exception as e:
            #print e
            output_data = ''
            continue
    if len(output_data) == 0:
        return None  # no result from the service (implicit in the original)
    m_output = textsim.textsim_output()
    m_output = sofa.deserialize(output_data, type(m_output))
    return m_output.textsim
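# Hedged usage sketch for _sim: it is a method, so an enclosing class that
# sets up self.textsim_agent is assumed, and `client` below is hypothetical.
# _sim returns None when every RPC attempt fails:
#
#   score = client._sim(query1, query2)
#   if score is not None:
#       print 'similarity: %.3f' % score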
def flush_batch(query_agent, in_sentences):
    """Parse one batch of sentences and print CoNLL-style rows to stdout."""
    m_input = nlpc.depparser_uni_input()
    m_input.grain_size = 1
    m_input.sentence_segmented = False
    m_input.sentences = in_sentences
    input_data = sofa.serialize(m_input)
    output_data = ''  # guard against all retries failing below
    for i in range(5):  # retry up to 5 times on RPC failure
        try:
            ret, output_data = query_agent.call_method(input_data)
            break
        except Exception as e:
            continue
    if len(output_data) == 0:
        sys.stdout.write('No result' + '\n')
        return
    m_output = nlpc.depparser_uni_output()
    m_output = sofa.deserialize(output_data, type(m_output))
    for dep_sentence in m_output.dep_sentences:
        dep_terms = dep_sentence.dep_terms
        for j in range(len(dep_terms)):
            # replace empty fields with the '_' placeholder; the original
            # tested `strip() is None`, which never fires because strip()
            # always returns a string
            for field in ('lemma', 'cpostag', 'postag', 'ner', 'feat', 'deprel'):
                if len(getattr(dep_terms[j], field).strip()) == 0:
                    setattr(dep_terms[j], field, '_')
            sys.stdout.write(str(j) + '\t' + dep_terms[j].word + '\t' +
                             dep_terms[j].lemma + '\t' + dep_terms[j].cpostag + '\t' +
                             dep_terms[j].postag + '\t' + dep_terms[j].ner + '\t' +
                             dep_terms[j].feat + '\t' + str(dep_terms[j].head) + '\t' +
                             dep_terms[j].deprel + '\n')
        sys.stdout.write('\n')  # blank line between sentences


def main():
    sofa.use('drpc.ver_1_0_0', 'S')
    sofa.use('nlpc.ver_1_0_0', 'nlpc')
    conf = sofa.Config()
    conf.load('./config/drpc_client.xml')
    #query_agent = S.ClientAgent(conf['sofa.service.nlpc_depparser_uni_query_107'])
    query_agent = S.ClientAgent(conf['sofa.service.nlpc_depparser_uni_web_107'])
    in_sentences = []
    while True:
        line = sys.stdin.readline()
        if len(line) <= 0:
            break
        in_sentences.append(str(line.strip(' \t\n\r')))
        if len(in_sentences) < 1000:  # accumulate a full batch per RPC call
            continue
        flush_batch(query_agent, in_sentences)
        in_sentences = []
    if len(in_sentences) > 0:  # flush the final partial batch
        flush_batch(query_agent, in_sentences)
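# Hedged note on the output format above: each token row has nine
# tab-separated columns in the spirit of CoNLL dependency format
# (index, word, lemma, cpostag, postag, ner, feat, head, deprel), with a
# blank line between sentences. For example, a row might look like:
#
#   0    百度    _    _    NR    ORG    _    1    SBJ
#
# The sample row is an illustrative assumption, not real service output.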
def main():
    sofa.use('drpc.ver_1_0_0', 'S')
    sofa.use('nlpc.ver_1_0_0', 'wordseg')
    conf = sofa.Config()
    conf.load('./config/drpc_client.xml')
    wordseg_agent = S.ClientAgent(conf['sofa.service.nlpc_wordseg_3016'])

    def want(name):
        # a section is printed when it is named on the command line,
        # or when no section is named at all
        return (len(argv) == 2 and argv[1] == name) or len(argv) == 1

    def print_terms(title, count, pos_list, buf):
        # print one titled block of space-separated terms
        stdout.write(title + '\n')
        for i in range(count):
            posidx = GET_TERM_POS(pos_list[i])
            poslen = GET_TERM_LEN(pos_list[i])
            stdout.write('%s ' % buf[posidx:posidx + poslen])
        stdout.write('\n')

    while True:
        line = stdin.readline()
        if len(line) <= 0:
            return
        line = line.decode('utf-8').encode('gbk')
        m_input = wordseg.wordseg_input()
        m_input.query = str(line)
        m_input.lang_id = int(0)
        m_input.lang_para = int(0)
        input_data = sofa.serialize(m_input)
        output_data = ''  # guard against all retries failing below
        for i in range(5):  # retry up to 5 times on RPC failure
            try:
                ret, output_data = wordseg_agent.call_method(input_data)
                break
            except Exception as e:
                continue
        if len(output_data) == 0:
            stdout.write('No result' + '\n')
            continue
        m_output = wordseg.wordseg_output()
        m_output = sofa.deserialize(output_data, type(m_output))
        m_output = m_output.scw_out
        if want('basic'):
            print_terms('=========== Basic Word Sep Result =============',
                        m_output.wsbtermcount, m_output.wsbtermpos, m_output.wordsepbuf)
        if want('segment'):
            print_terms('============ Word Phrase Result ==============',
                        m_output.wpbtermcount, m_output.wpbtermpos, m_output.wpcompbuf)
        if want('phrase'):
            print_terms('============ Sub Phrase Result ==============',
                        m_output.spbtermcount, m_output.spbtermpos, m_output.subphrbuf)
        if want('new'):
            pnewword = m_output.pnewword
            print_terms('============ New Word Result ==============',
                        pnewword.newwordbtermcount, pnewword.newwordbtermpos,
                        pnewword.newwordbuf)
        if want('human'):
            print_terms('=========== Human Name Result ==============',
                        m_output.namebtermcount, m_output.namebtermpos, m_output.namebuf)
        if want('book'):
            print_terms('=============== book names =================',
                        m_output.bnbtermcount, m_output.bnbtermpos, m_output.booknamebuf)
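# Hedged usage sketch for this demo (the script name wordseg_demo.py and
# the input file are assumptions; the config path ./config/drpc_client.xml
# comes from the code above):
#
#   python wordseg_demo.py < sentences.txt          # print every section
#   python wordseg_demo.py human < sentences.txt    # only human-name results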