def AIC_predict():
    """Run PIC tagging, then AIC evaluation and tagging, on the result data.

    Two separate ``Tagger`` instances are used, one per stage.
    """
    print("iter : ", config.iter)

    # Stage 1: apply PIC processing to the morphologically analysed
    # raw sentences.
    #   input  : config.result_input_path
    #   output : config.result_processed_path
    pic_tagger = Tagger()
    pic_tagger.taggingPIC("result_tagging")

    # Stage 2: apply AIC to the PIC-processed raw sentences.
    #   input  : config.result_processed_path
    #   output : config.result_output_path
    aic_tagger = Tagger()
    aic_tagger.evaluateAIC("result")
    aic_tagger.main_taggingAIC(mode="result_tagging")
def __init__(self):
    """Set up three taggers and load the chunker grammar.

    ``TAGGER1`` and ``TAGGER2`` are built from the objects stored under
    ``"tagger"`` and ``"tagger2"``; ``TAGGER3`` is a CRF tagger pointed at
    a model file on disk.
    """
    # Load the first set of pre-trained POS-tagger data.
    uni_1, bi_1, tri_1, word_1 = self.load_obj("tagger")
    self.TAGGER1 = Tagger(uni_1, bi_1, tri_1, word_1)

    # Load the second set of pre-trained POS-tagger data.
    uni_2, bi_2, tri_2, word_2 = self.load_obj("tagger2")
    self.TAGGER2 = Tagger(uni_2, bi_2, tri_2, word_2)

    crf_tagger = CRFTagger()
    self.TAGGER3 = crf_tagger
    crf_tagger.set_model_file(
        'dataset/all_indo_man_tag_corpus_model.crf.tagger')

    # Load the chunker grammar data.
    self.load_chunker()
def main(argv):
    """Parse command-line options, scan audio tags, then build a catalogue.

    Options:
        -h / --help  print usage and exit.
        -q           quiet mode (passed to ``Tagger`` and ``Cataloguer``).
        -p PATH      path to process (required).

    Args:
        argv: the argument list without the program name.

    Exits with status 2 on an option-parsing error and status 0 after
    printing usage (including when no path was supplied).
    """
    try:
        # Positional leftovers are not used by this tool.
        opts, _ = getopt.getopt(argv, "hp:q", ["help"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    path = ""
    quiet = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            sys.exit(0)
        if opt == '-q':
            quiet = True
        elif opt == '-p':
            path = arg
        else:
            usage()
            sys.exit(0)

    if not path:
        usage()
        sys.exit(0)

    print("Starting tag scanning...")
    tagger = Tagger(path, quiet)
    tagger.scan_audio_files()
    print("...Done!")
    # A bare ``print`` is a no-op expression under Python 3; print an
    # explicit empty string so the separator line appears on 2 and 3 alike.
    print("")

    print("Starting file cataloguing...")
    dic = tagger.get_scanned_dic()
    cataloguer = Cataloguer(path, dic, quiet)
    cataloguer.create_catalogue()
    print("...Done!")
    print("")
def mainTag(featureSet, options):
    """Build a ``Tagger`` and run it in the mode selected by ``options``.

    Modes, checked in this order:
        * ``inFeatFile``   - tag a featurized file to ``outputStream``.
        * ``ioDirs``       - tag every file of ``ioDirs[0]`` and append each
                             sentence to ``ioDirs[1]/<fileName>.tagged``.
        * ``toCRFsuite``   - convert ``inputStream`` to CRFsuite format.
        * ``printWeights`` - print model weights to ``outputStream``.
        * otherwise        - tag ``inputStream`` to ``outputStream``.

    The transition model is only loaded when neither ``printWeights`` nor
    ``toCRFsuite`` is requested.

    Args:
        featureSet: feature set handed to ``Tagger``.
        options: mapping of option names to values.
    """
    transModel = None
    if not (options['printWeights'] or options['toCRFsuite']):
        print('loading transition model...', end='', file=sys.stderr,
              flush=True)
        transModel = TransModel.getModelFromFile(
            options['transModelFileName'])
        print('done', file=sys.stderr, flush=True)

    tagger = Tagger(featureSet, transModel, options)

    if 'inFeatFile' in options and options['inFeatFile']:
        # Tag a featurized file to outputStream
        for sen, comment in tagger.tagFeatures(options['inFeatFile']):
            writeSentence(sen, options['outputStream'], comment)
    elif 'ioDirs' in options and options['ioDirs']:
        # Tag all files in a directory to fileName.tagged
        for sen, fileName in tagger.tagDir(options['ioDirs'][0]):
            outPath = join(options['ioDirs'][1],
                           '{0}.tagged'.format(fileName))
            # Close the handle after every sentence; previously each
            # sentence leaked one open append-mode file object.
            with open(outPath, 'a', encoding='UTF-8') as outFile:
                writeSentence(sen, outFile)
    elif 'toCRFsuite' in options and options['toCRFsuite']:
        # Make CRFsuite format to outputStream for tagging
        tagger.toCRFsuite(options['inputStream'], options['outputStream'])
    elif 'printWeights' in options and options['printWeights']:
        # Print MaxEnt weights to outputStream
        tagger.printWeights(options['printWeights'], options['outputStream'])
    else:
        # Tag inputStream to outputStream
        for sen, comment in tagger.tagCorp(options['inputStream']):
            writeSentence(sen, options['outputStream'], comment)
def _evaluation(hp, fn_model, data):
    """Score the tagger against ``data`` with the in-memory mecab evaluator.

    Args:
        hp: mapping providing ``'VOCAB'`` and ``'HYPERPARAMS'``.
        fn_model: model parameters handed to ``Tagger`` as ``params``.
        data: dataset exposing ``ws_data``, ``words`` and ``pos_data``.

    Returns:
        ``(ws_f, pos_f)`` - the third and sixth values returned by
        ``mecab_system_eval.calculate_fvalues``.
    """
    tagger = Tagger(vocabs=hp['VOCAB'], params=fn_model,
                    hp=hp['HYPERPARAMS'])

    def to_eval_rows(tokens, tags):
        # Each row is [word, "word<TAB>tag"]; both parts are UTF-8 encoded
        # when the evaluator reports PY_3.
        rows = []
        for token, tag in zip(tokens, tags):
            joined = token + "\t" + tag
            if mecab_system_eval.PY_3 is True:
                token = token.encode("UTF-8")
                joined = joined.encode("UTF-8")
            rows.append([token, joined])
        return rows

    predicted = []
    reference = []
    for idx in range(len(data.ws_data)):
        words = data.words[idx]
        gold_ids = data.pos_data[idx][1]
        gold_tags = [tagger.id2pos[pid] for pid in gold_ids]
        reference.append(to_eval_rows(words, gold_tags))

        result = tagger.tagging(''.join(words))
        predicted.append(to_eval_rows(result.words, result.postags))

    counts = mecab_system_eval.mecab_eval(predicted, reference)
    fvalues = mecab_system_eval.calculate_fvalues(counts)
    _, _, ws_f, _, _, pos_f = fvalues
    return ws_f, pos_f
def evaluation(hp, fn_model, data):
    """Write gold and predicted taggings to temp files and score them.

    For every sentence the gold ``word<TAB>pos`` lines go to
    ``hp['TMP_GOLD']`` and the tagger's output goes to ``hp['TMP_PRED']``,
    each sentence terminated by an ``EOS`` line.  ``mecab_eval`` is then
    called with the two file paths.

    Args:
        hp: mapping providing ``'VOCAB'``, ``'HYPERPARAMS'``, ``'TMP_GOLD'``
            and ``'TMP_PRED'``.
        fn_model: model parameters handed to ``Tagger`` as ``params``.
        data: dataset exposing ``ws_data``, ``words`` and ``pos_data``.

    Returns:
        ``(ws_f, pos_f)`` as returned by ``mecab_eval``.
    """
    tagger = Tagger(vocabs=hp['VOCAB'], params=fn_model,
                    hp=hp['HYPERPARAMS'])

    # Both files must be closed (and therefore flushed) before mecab_eval
    # reads them by path; the handles were previously left open, so
    # buffered output could be missing when the evaluator ran.
    with open(hp['TMP_GOLD'], 'w') as gold, open(hp['TMP_PRED'], 'w') as pred:
        for i in range(len(data.ws_data)):
            words = data.words[i]
            pids = data.pos_data[i][1]
            postags = [tagger.id2pos[pid] for pid in pids]
            for w, p in zip(words, postags):
                gold.write(w + '\t' + p + '\n')
            gold.write('EOS\n')

            output = tagger.tagging(''.join(words))
            for w, p in zip(output.words, output.postags):
                pred.write(w + '\t' + p + '\n')
            pred.write('EOS\n')

    ws_f, pos_f = mecab_eval(hp['TMP_PRED'], hp['TMP_GOLD'])
    return ws_f, pos_f
def diversity_sampling(feature_now, model_ver, budget):
    """Train one Tagger per feature on samples picked by diversitySampling.

    Args:
        feature_now: iterable of feature indices; each is used to index
            FEATURE_SHAPE and is passed to the models as feature_number.
        model_ver: model name; prefixes the per-feature model files and
            names the CSV file under 'records_us/'.
        budget: number of samples requested per episode; clamped to the
            training-set size inside the loop.

    Runs MAX_EPISODE episodes and calls write_csv once per episode.
    Returns None.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source
    # line - confirm the loop boundaries against the original file.
    csvfile = 'records_us/'+model_ver+'.csv'
    model_selected = []
    for i in feature_now:
        with tf.name_scope(model_ver+'/feature_{0}'.format(i)):
            model = Tagger(model_file=model_ver+'/feature_{0}'.format(i), n_input=FEATURE_SHAPE[i][0],n_steps=FEATURE_SHAPE[i][1],feature_number=i)
            # Initial train call with empty data.
            # presumably this only builds/initialises the model - TODO confirm
            model.train([],[],feature_number=i)
            model_selected.append(model)
    train_data, test_data = data_generation(model_selected, feature_now)
    # Both results are indexed as (x, y) pairs.
    train_x_all = train_data[0]
    train_y_all = train_data[1]
    test_x_all = test_data[0]
    test_y_all = test_data[1]
    episode = 1
    print(">>>>>> Playing game ..")
    while episode <= MAX_EPISODE:
        # Candidate pool is the first min(4 * budget, dataset size) samples.
        sample_N = min(budget*4,len(train_y_all))
        N = len(train_y_all)
        # The clamp rebinds the parameter, so it carries over to later episodes.
        budget = min(budget,N)
        # 2-D slicing here assumes train_x_all supports numpy-style
        # indexing - TODO confirm.
        s = diversitySampling(train_x_all[:,:sample_N], pool = [], budget = budget)
        s.updateCplus()
        queried_indexs = s.newind
        # Train every per-feature model on the queried samples.
        for i in range(len(model_selected)):
            model_selected[i].train(np.array(train_x_all[i])[queried_indexs], np.array(train_y_all)[queried_indexs], feature_now[i])
        print(">>>>>> Terminate ...")
        write_csv(episode, csvfile, model_selected, train_x_all, test_x_all, train_y_all, test_y_all)
        episode = episode+1
def test_match_label_IOB_applied_correctly(self):
    """No token tagged ``O`` may be directly followed by an ``I`` token.

    ``match_label`` is applied three times in a row with the same label
    before default entity tags are added.
    """
    def make_token(word):
        return (word, "", "", "")

    def tag_prefix(token):
        return token[3].split("-", 1)[0]

    tagger = Tagger()
    word_rows = [
        ["Brunel", "University", "test", "test"],
        ["test", "test", "Brunel", "University"],
        ["test", "test", "Brunel", "University"],
    ]
    sentences = [[make_token(word) for word in row] for row in word_rows]
    input_label = "Brunel University"
    input_match_tag = "match"

    tagged = tagger.match_label(sentences, input_label, input_match_tag)
    tagged = tagger.match_label(tagged, input_label, input_match_tag)
    tagged = tagger.match_label(tagged, input_label, input_match_tag)
    tagged = tagger.add_default_entity_tags(tagged)

    iob_is_valid = True
    for line in tagged:
        last_pos = len(line) - 1
        for pos, token in enumerate(line):
            if tag_prefix(token) != "O":
                continue
            if pos == last_pos:
                following = "EOL"
            else:
                following = tag_prefix(line[pos + 1])
            if following == "I":
                iob_is_valid = False
    self.assertEqual(iob_is_valid, True)
def PIC_train():
    """Train the PIC model and then evaluate it.

    This is the step that applies PIC to the unlabeled training data that
    is later used for bootstrapping.
        train_set : config.trainPIC_path
        test_set  : config.testPIC_path
    """
    pic_tagger = Tagger()
    pic_tagger.main_trainPIC()
    pic_tagger.evaluatePIC()
def main():
    """Parse the command line, build a ``Tagger`` and benchmark it.

    The tagger is constructed from ``data/wsj00-18.pos``; the benchmark data
    comes from the file given with ``--file``.
    """
    cli = argparse.ArgumentParser(description='Part-of-Speech Tagging.')
    cli.add_argument(
        '--prefix', '-p',
        type=str,
        default='',
        help='specify prefix of files which will be used to store model')
    cli.add_argument(
        '--times', '-t',
        type=int,
        default=1,
        help='specify iteration times')
    cli.add_argument(
        '--all', '-a',
        action='store_true',
        help='without this switch, model will be trained by random sampled data')
    cli.add_argument(
        '--file', '-f',
        type=str,
        default='',
        help='specify test data file')
    cli.add_argument(
        '--save', '-s',
        action='store_true',
        help='enable this to save model file')
    args = cli.parse_args()

    # The third Tagger argument is the negation of --all.
    negated_all = not args.all
    tagger = Tagger('data/wsj00-18.pos', args.times, negated_all, args.save,
                    args.prefix)
    tagger.benchmark(Processor(args.file))
def pos():
    """Tag the ``"text"`` field of the JSON request body.

    Returns:
        The result of ``Tagger().run(...)`` serialised as a JSON string.
    """
    text = request.json["text"]
    tagged = Tagger().run(text)
    return json.dumps(tagged)
def main_tag(featureSet, options):
    """Load the model bookkeeping files and tag input in the selected mode.

    Modes, checked in this order:
        * ``options.inFeatFile`` - tag a featurized file.
        * ``options.input_dir``  - tag every file of the directory and
          append each sentence to ``<input_dir>_out/<name>.tagged``.
        * otherwise              - tag sentences read from ``sys.stdin``.

    Args:
        featureSet: feature set handed to ``Tagger``.
        options: parsed options object; ``vars(options)`` is extended with
            ``labelCounter``, ``featCounter`` and ``modelFile`` before being
            passed to ``Tagger``.

    Raises:
        AssertionError: ``options.input_dir`` is set but not a directory.
        OSError: the output directory cannot be created (``os.mkdir``).
    """
    labelCounter, featCounter = BookKeeper(), BookKeeper()
    labelCounter.readFromFile('{0}.labelNumbers'.format(options.modelName))
    featCounter.readFromFile('{0}.featureNumbers'.format(options.modelName))

    optionsDict = vars(options)
    optionsDict['labelCounter'] = labelCounter
    optionsDict['featCounter'] = featCounter
    optionsDict['modelFile'] = '{0}.model'.format(options.modelName)
    tagger = Tagger(featureSet, optionsDict)

    if options.inFeatFile:
        tagger_func = lambda: tagger.tag_features(options.inFeatFile)
        writer_func = lambda s, c: writeSentence(s, comment=c)
    elif options.input_dir:
        assert isdir(options.input_dir), "--input-dir must be a directory"
        out_dir = "{}_out".format(options.input_dir)
        os.mkdir(out_dir)
        tagger_func = lambda: tagger.tag_dir(options.input_dir)

        def writer_func(s, c):
            # Close the handle after each sentence; the previous lambda
            # leaked one open append-mode file per sentence.
            with open(join(out_dir, '{}.tagged'.format(c)), 'a') as out:
                writeSentence(s, out=out)
    else:
        tagger_func = lambda: tagger.tag_corp(sys.stdin)
        writer_func = lambda s, c: writeSentence(s, comment=c)

    for sen, other in tagger_func():
        writer_func(sen, other)
def learn(self, num_epochs, config_dict, seed):
    """Build a ``Tagger`` for one configuration and train it.

    Args:
        num_epochs: number of epochs; passed to both the constructor and
            ``train``.
        config_dict: a chosen value for each parameter; expanded as keyword
            arguments for both the constructor and ``train``.
        seed: seed passed to ``train``.

    Returns:
        Whatever ``train`` returns. Per the original note this is a dict
        ``{epoch: (model, train_loss, dev_loss, test_loss, acc, f1_macro,
        f1_weighted)}``.
    """
    tagger_model = Tagger(
        self.modelname,
        self.datafile,
        self.paramfile,
        num_epochs,
        self.batchsize,
        **config_dict
    )
    return tagger_model.train(num_epochs, seed, **config_dict)
def main(argv):
    """Run the lexer -> parser -> NER -> tagger pipeline on a text file.

    Args:
        argv: full argument vector. ``argv[1]`` is the input file and
            ``argv[2]`` is passed to the tagger; optional flags follow:
            ``-d`` (position 3 only) generates NER models from the
            dictionary files, ``-f`` (position 3 or 4) selects frequency
            tagging, ``-t`` (position 3, 4 or 5) enables own-tag mode.
    """
    if len(argv) < 3:
        usage(argv)

    # Each flag is only recognised at the positions listed in the docstring.
    dic = len(argv) >= 4 and argv[3] == "-d"
    freq = "-f" in argv[3:5]
    own_tag = "-t" in argv[3:6]

    ex = Util.transform_text(Util.read_file(argv[1]))
    models = ["data/location.txt", "data/person.txt",
              "data/organisation.txt"]

    # Lexical analysis
    lexer = Lexer(ex, own_tag)
    lexer.lex()

    # Syntactic analysis
    parser = Parser(lexer.get_tokenized_text(), own_tag)
    parser.parse()

    # Semantic analysis + named-entity recognition
    ner = NER(ex, parser.get_parsed_text())
    if dic:
        ner.gen_models(models)
    ner.apply()

    # Text mark-up
    tagger = Tagger(ner.get_ner(), ex)
    tag_method = tagger.freq_tag if freq else tagger.tag
    tag_method(argv[2])
def test_run1(filename, test_product, no_of_clusters):
    """Cluster, read the category of the result, then run the converter.

    Args:
        filename: passed to ``Cluster.test_run``.
        test_product: mapping; its ``'category'`` entry is the target
            category handed to the converter.
        no_of_clusters: passed to ``Cluster.test_run``.
    """
    converter = Converter()
    clusterer = Cluster()
    category_reader = Tagger()

    target_category = test_product['category']
    cluster_result = clusterer.test_run(filename, test_product,
                                        no_of_clusters)
    found_category = category_reader.readCategory(cluster_result)
    converter.run('r200.txt', cluster_result, found_category,
                  target_category, 'newSum1.txt')
def brain(command):
    """Interpret a text command: report the weather or find/open a file.

    If the command contains the word 'weather' and weatherFunction returns
    a non-empty list, a weather sentence for its first entry is returned.
    Otherwise the command is tokenized and tagged, an action word and an
    'NN'-tagged token are picked out, and the matching file is searched
    for and opened.

    Returns:
        The weather sentence, or the fixed service message after an action
        in actions[:15]; every other path falls off the end and returns
        None.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source
    # line - confirm against the original file.
    response = ""
    command = command  # no-op self-assignment
    # NOTE(review): the original comment said indices 0-15 are search/find
    # verbs and 16-21 are open verbs, but the code below slices
    # actions[:15] (indices 0-14) and actions[15:21] (indices 15-20) -
    # confirm which is intended.
    actions = [
        "search", "find", "view", "reach", "detect", "get", "catch",
        "explore", "achieve", "obtain", "pass", "check", "reveal", "expose",
        "observe", "show", "see", "listen", "hear", "open", "watch", "arise",
        "awaken", "call", "consciousness", "get up", "stir", "wake", "wake up"
    ]
    tokens = Tokenizer().tokenize(command)
    # call weather function if there is weather word and country or city name
    citiesORcountries = weatherFunction(command)
    if 'weather' in command.split() and citiesORcountries != []:
        return 'the weather in ' + citiesORcountries[0] + ' is ' + WeatherC(
        ).weatherForecast(citiesORcountries[0]) + ' today'
    action = None
    fileName = None
    # -----------------------------------<<Variable>>--------------------------------------------
    tagSentence = Tagger().tag(tokens)
    # The last matching token wins for both action and fileName.
    for counter in range(len(tagSentence)):
        # if tagSentence[counter][1] == 'VB' or tagSentence[counter][0] in self.actions:
        if tagSentence[counter][0] in actions:
            action = tagSentence[counter][0]
        elif tagSentence[counter][1] == 'NN':
            fileName = tagSentence[counter][0]
    # NOTE(review): action is still None here when no token matched;
    # whether snowBallStemmer accepts None is not visible - confirm.
    normlizeAction = Normalizer().snowBallStemmer(action)
    if normlizeAction in actions:
        filePath = FileSearch().search(
            fileName)  # return list of file shared the same name
        # NOTE(review): filePath[0] below raises IndexError if the search
        # result is empty - confirm search() never returns an empty list.
        if normlizeAction in actions[:15]:
            # for search about folder or file
            OpenMedia().openFile(filePath[0].split("//")[0])
            response = "i hope you're satisfied with our service"
            return response
        if normlizeAction in actions[15:21]:
            # NOTE(review): this compares a string to a list, so the !=
            # is always True; the branch body is `pass`, so the check has
            # no effect either way. Probably meant as a media-extension
            # guard - confirm.
            if normlizeAction in [
                    'listen', 'hear', 'watch'
            ] and filePath[0].split('.')[1] != ['mp3', 'mp4', 'mkv']:
                pass
            OpenMedia().openFile(filePath[0])
def play_ner(feature_now, model_ver, poly, niter, logit, method):
    """Play the game with an LSTM-Q robot over per-feature Tagger models.

    Args:
        feature_now: iterable of feature indices; each indexes
            FEATURE_SHAPE and is passed to the models as feature_number.
        model_ver: model name; prefixes the per-feature model files and
            names the '.npy' weights file.
        poly: forwarded to RobotLSTMQ.
        niter: passed to each Tagger as epochs.
        logit: forwarded to RobotLSTMQ.
        method: forwarded to initialise_game.

    Returns:
        The robot after the episode loop.

    Raises:
        SystemExit: when AGENT is not "LSTMQ".
    """
    # NOTE(review): block nesting was reconstructed from a flattened source
    # line - confirm the loop/branch boundaries against the original file.
    actions = 2
    # BUDGET is only read in this function, never assigned.
    global BUDGET
    tf.reset_default_graph()
    if AGENT == "LSTMQ":
        robot = RobotLSTMQ(actions, FEATURE, content = CONTENT, poly = poly, logit = logit, ntype = NTYPE, expnum = EXPNUM)
    else:
        print("** There is no robot.")
        raise SystemExit
    ############NEW###############################
    model_selected = []
    for i in feature_now:
        with tf.name_scope(model_ver+'/feature_{0}'.format(i)):
            model = Tagger(model_file=model_ver+'/feature_{0}'.format(i), n_input=FEATURE_SHAPE[i][0],n_steps=FEATURE_SHAPE[i][1],feature_number=i, epochs=niter, expnum = EXPNUM)
            # Initial train call with empty data.
            # presumably this only builds/initialises the model - TODO confirm
            model.train([],[],feature_number = i)
            model_selected.append(model)
    # NOTE(review): the game receives the module-level NITER, not the niter
    # parameter - confirm that is intended.
    game = initialise_game(model_selected,BUDGET,NITER,FEATURE, method)
    ###############################################
    # initialise a decision robot
    # play game
    episode = 1
    rAll = []
    while episode <= MAX_EPISODE:
        observation = game.get_frame(model_selected)
        action = robot.get_action(observation)
        reward, observation2, terminal = game.feedback(action, model_selected)
        game.rAll.append(reward)
        rAll.append(reward)
        robot.update(observation, action, reward, observation2, terminal)
        # The episode counter only advances on a terminal step.
        if terminal == True:
            print("> Episodes finished: ", float("%.3f" % (episode/MAX_EPISODE)), "> Reward: ", float("%.3f" % np.mean(rAll)))
            episode += 1
            rAll = []
            if episode == MAX_EPISODE:
                print('in')
                # NOTE(review): saves under the module-level MODEL_VER
                # rather than the model_ver parameter - confirm.
                robot.save_Q_network(MODEL_VER)
    weights = find_weight.find_weight(model_selected, game.dev_x_all, game.dev_y_all)
    np.save(model_ver+'.npy', weights)
    print(weights)
    return robot
def download(self):
    """Download the best audio stream, convert it to MP3, tag and rename it.

    Sets ``self.newtitle`` to the slugified stream title.

    Returns:
        The result of ``self.__renameFile`` applied to the tagged file.
    """
    stream = pafy.new(self.url).getbestaudio()
    downloaded_path = stream.download()
    self.newtitle = self.slugify(stream.title)
    self.__convertToMp3(downloaded_path, stream.extension)

    mp3_name = self.newtitle + '.mp3'
    tagger = Tagger(mp3_name, self.title, self.artist, self.genre,
                    self.album)
    tagged_mp3 = tagger.editTags()
    return self.__renameFile(tagged_mp3)
def test_pos_tag_same_nr_tokens(self):
    """POS tagging must return as many tokens in total as it was given."""
    tagger = Tagger()
    source_lines, tagged_lines = self.pos_tag_get_results(tagger)
    source_total = sum(len(line) for line in source_lines)
    tagged_total = sum(len(line) for line in tagged_lines)
    self.assertEqual(source_total, tagged_total)
def AIC_train():
    """Run the AIC bootstrapping loop: main model, then bagging models.

    Each iteration trains and evaluates the main AIC tagger, self-tags
    data with it, splits that output, trains and evaluates one bagging
    tagger per split, sums the bagging scores, converts them to tags and
    feeds them to main_bootstrap. config.iter is incremented at the end of
    every iteration.

    The training data used for bootstrapping must already have been
    PIC-processed via "PIC_train".
    """
    # NOTE(review): block nesting was reconstructed from a flattened source
    # line - confirm the loop boundaries against the original file.
    # `epoch` itself is unused; range(1, boot_iter) runs boot_iter - 1 times.
    for epoch in range(1, config.boot_iter):
        print("iter : ", config.iter)
        main_tagger_AIC = Tagger()
        main_tagger_AIC.main_trainAIC()
        main_tagger_AIC.evaluateAIC("main_model")
        self_trainig = main_tagger_AIC.main_taggingAIC(
            mode="self_tagging")  # init self training data
        # Train the bagging models.
        # Elements 0, 1 and 2 of the self-tagging result are passed to the
        # splitter; their meaning is not visible here.
        splited_labels, splited_features, splited_sentences = split_self_labeling(
            self_trainig[0], self_trainig[1], self_trainig[2])
        print("%s_iter(main) -> self_labled_s1 : %s" %
              (config.iter, len(self_trainig[0])))
        # model_idx is 1-based; the split lists are indexed with model_idx - 1.
        for model_idx in range(1, config.model_num + 1):
            print("model_idx : ", model_idx)
            bagging_taggerAIC = Tagger()
            bagging_taggerAIC.bagging_trainAIC("bagging_train", model_idx,
                                               splited_features[model_idx - 1],
                                               splited_labels[model_idx - 1])
            print("bagging model%s acc" % model_idx)
            bagging_taggerAIC.evaluateAIC("bagging_eval", model_idx)
            # each bagging model
            bagging_bootstrap(model_idx, splited_sentences[model_idx - 1],
                              splited_labels[model_idx - 1])
            if model_idx == 1:
                # The first model provides the initial scores plus the
                # sentences and features.
                score_i, raw_sentences, features = bagging_taggerAIC.bagging_taggingAIC(
                    "self_tagging", model_idx)
            else:
                # NOTE(review): the same call is unpacked into three values
                # above but used as a single score sequence here - confirm
                # the return shape of bagging_taggingAIC for model_idx > 1.
                new_socre_i = bagging_taggerAIC.bagging_taggingAIC(
                    "self_tagging", model_idx)
                # Accumulate the scores element-wise across bagging models.
                for idx, _ in enumerate(new_socre_i):
                    score_i[idx] = np.asarray(score_i[idx]) + np.asarray(
                        new_socre_i[idx])
        # score2tag is called on the last bagging tagger created above.
        predicts = bagging_taggerAIC.score2tag(score_i, raw_sentences,
                                               features)
        main_bootstrap(predicts)
        config.iter += 1
def tag(self):
    """Populate ``self.lemmas`` from the cache file or by running the tagger.

    The cache at ``self.lemma_file`` is used when it exists and ``'tag'``
    is not in ``self.args.no_cache``. Otherwise the tagger is run and its
    result is written to the cache file as JSON.
    """
    cache_usable = (os.path.exists(self.lemma_file)
                    and 'tag' not in self.args.no_cache)
    if cache_usable:
        print('Reading lemmas')
        with open(self.lemma_file, 'r') as cached:
            self.lemmas = json.load(cached)
        return

    print('Tagging')
    obt_tagger = Tagger(self.args.obt_path, self.promises)
    self.lemmas = obt_tagger.tag()
    with open(self.lemma_file, 'w') as cache_out:
        cache_out.write(json.dumps(self.lemmas))
def test_match_label_pos_labels_not_altered(self):
    """``match_label`` must leave slot 1 of every token tuple unchanged."""
    tagger = Tagger()
    original, labelled = self.match_label_get_results(tagger)
    # Build a list (not a generator) so every token is visited even after
    # a mismatch, exactly like a plain nested loop.
    comparisons = [
        not (token[1] != original[row][col][1])
        for row, line in enumerate(labelled)
        for col, token in enumerate(line)
    ]
    self.assertEqual(all(comparisons), True)
def __init__(self, config, tagger=False, scraper=False, prepare_db=False):
    """Store the configuration and optionally run the requested stages.

    Args:
        config: configuration mapping; ``config['parlis']['url']`` is read
            when the scraper runs.
        tagger: run ``Tagger().run()`` when true.
        scraper: run the ``Scraper`` when true.
        prepare_db: call ``self._prepare_database()`` first and stop
            initialising if it returns a falsy value.
    """
    self.config = config

    if prepare_db:
        database_ready = self._prepare_database()
        if not database_ready:
            return

    if scraper:
        Scraper(self.config['parlis']['url']).run()

    if tagger:
        Tagger().run()
def test_pos_tag_nonlocalner_labels_not_altered(self):
    """POS tagging must leave slot 2 of every token tuple unchanged."""
    tagger = Tagger()
    original, tagged = self.pos_tag_get_results(tagger)
    # Build a list (not a generator) so every token is visited even after
    # a mismatch, exactly like a plain nested loop.
    comparisons = [
        not (token[2] != original[row][col][2])
        for row, line in enumerate(tagged)
        for col, token in enumerate(line)
    ]
    self.assertEqual(all(comparisons), True)
def test_pos_tag_same_token_strs_returned(self):
    """POS tagging must leave slot 0 of every token tuple unchanged."""
    tagger = Tagger()
    original, tagged = self.pos_tag_get_results(tagger)
    # Build a list (not a generator) so every token is visited even after
    # a mismatch, exactly like a plain nested loop.
    comparisons = [
        not (token[0] != original[row][col][0])
        for row, line in enumerate(tagged)
        for col, token in enumerate(line)
    ]
    self.assertEqual(all(comparisons), True)
def conservative_sampling(feature_now, model_ver, budget_test, cvit):
    """Per test child, train each model on the lowest-disagreement samples.

    For every entry of 'test_child.txt' the per-model confidences are
    computed over a candidate pool, the spread (max minus min confidence
    across models) is taken per sample, and the budget_test samples with
    the smallest spread are used for train_mode_B on every model.

    Args:
        feature_now: iterable of feature indices; each indexes
            FEATURE_SHAPE and is passed to the models as feature_number.
        model_ver: model name; prefixes the per-feature model files and is
            passed to write_test_csv.
        budget_test: number of samples to query per child; clamped to the
            dataset size inside the loop.
        cvit: passed through to write_test_csv.

    Returns None.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source
    # line - confirm the loop boundaries against the original file.
    test_child = []
    # Keep the first whitespace-separated field of every line.
    with open('test_child.txt') as f:
        for line in f.readlines():
            l = line.split()[0]
            test_child.append(l)
    # student is the individual test kid
    print(">>>>>> Playing game ..")
    model_selected = []
    for i in feature_now:
        with tf.name_scope(model_ver + '/feature_{0}'.format(i)):
            model = Tagger(model_file=model_ver + '/feature_{0}'.format(i),
                           n_input=FEATURE_SHAPE[i][0],
                           n_steps=FEATURE_SHAPE[i][1],
                           feature_number=i)
            model_selected.append(model)
    ID_student = 0
    # The loop runs once per entry of test_child (the original comment
    # mentioned a fixed limit of 14; the code uses len(test_child)).
    while ID_student < len(test_child):
        train_x_all, train_y_all = test_data(model_selected, feature_now,
                                             test_child[ID_student])
        # The same data is used for training and for the CSV report.
        test_x_all, test_y_all = train_x_all, train_y_all
        # Candidate pool is the first min(4 * budget_test, dataset size) samples.
        sample_N = min(budget_test * 4, len(train_y_all))
        N = len(train_y_all)
        # The clamp rebinds the parameter, so it carries over to later children.
        budget_test = min(budget_test, N)
        confidence = []
        conf_diff = np.zeros((sample_N, ))
        # One confidence sequence per model over the candidate pool.
        for i in range(len(model_selected)):
            confidence.append(model_selected[i].get_confidence(
                list(train_x_all[i][:sample_N])))
        # index of the model with the highest confidence, per sample
        ind_max = np.argmax(confidence, axis=0)
        # index of the model with the lowest confidence, per sample
        ind_min = np.argmin(confidence, axis=0)
        for i in range(sample_N):
            conf_diff[i] = confidence[ind_max[i]][i] - confidence[
                ind_min[i]][i]
        # Ascending sort: keep the budget_test samples with the smallest spread.
        queried_indexs = sorted(range(len(conf_diff)),
                                key=lambda i: conf_diff[i])[:budget_test]
        for i in range(len(model_selected)):
            model_selected[i].train_mode_B(
                np.array(train_x_all[i])[queried_indexs],
                np.array(train_y_all)[queried_indexs], feature_now[i])
        print("training of mode B finished")
        write_test_csv(model_selected, ID_student, model_ver, cvit,
                       test_x_all, test_y_all)
        ID_student = ID_student + 1
def Get_TweetTags(data, no_tags, multi_tag_len, dict_path=None):
    """Return the best tags for ``data`` using a pickled weights dictionary.

    Args:
        data: text handed to the tagger.
        no_tags: number of tags requested from the tagger.
        multi_tag_len: passed to ``tagger.Rater`` together with the weights.
        dict_path: path of a pickled weights dictionary; when ``None`` the
            default ``BASE_DIR + '/Summarizer_Tagger/data/dict.pkl'`` is used.

    Returns:
        The result of calling the tagger with ``data`` and ``no_tags``.
    """
    if dict_path is None:
        # default dictionary
        dict_path = BASE_DIR + '/Summarizer_Tagger/data/dict.pkl'

    # NOTE: unpickling can execute arbitrary code; only pass dictionary
    # files from a trusted source.
    # The file is closed as soon as it is loaded (the handle used to leak).
    with open(dict_path, 'rb') as dict_file:
        weights = pickle.load(dict_file)

    myreader = tagger.Reader()  # or your own reader class
    mystemmer = tagger.Stemmer()  # or your own stemmer class
    myrater = tagger.Rater(weights, multi_tag_len)  # or your own rater class
    mytagger = Tagger(myreader, mystemmer, myrater)
    best_tags = mytagger(data, no_tags)
    return best_tags
def test_pos_tag_4_slots_each_tuple(self):
    """Input and output must contain the same number of 4-slot tuples."""
    def count_four_slot(lines):
        return sum(len(token) == 4 for line in lines for token in line)

    tagger = Tagger()
    source_lines, tagged_lines = self.pos_tag_get_results(tagger)
    source_count = count_four_slot(source_lines)
    tagged_count = count_four_slot(tagged_lines)
    self.assertEqual(source_count, tagged_count)
def analyze(self, text, tokenizer=str.split):
    """Analyze ``text`` with a lazily created tagger.

    The tagger is built on first use and stored on ``self.tagger``; later
    calls reuse it, so ``tokenizer`` only takes effect when the tagger is
    created.

    Args:
        text: string, the input text.
        tokenizer: callable used to tokenize the input sentence. Defaults
            to ``str.split``.

    Returns:
        res: dict, as produced by the tagger's ``analyze``.
    """
    if self.tagger:
        return self.tagger.analyze(text)

    self.tagger = Tagger(self.model,
                         preprocessor=self.p,
                         tokenizer=tokenizer)
    return self.tagger.analyze(text)
def run():
    """Listen for sentences forever, tag each one and process the tags.

    The loop ends on ``KeyboardInterrupt``. Any other exception raised
    while handling a sentence is logged and the loop moves on to the next
    sentence.
    """
    import logging

    print("start")
    listener = Recog()
    tagger = Tagger()
    print("done setting up")
    while True:
        try:
            sentence = listener.listen()
            # sentence = "make line graph using range from A1 to E4"
            tags = tagger.match_rules(sentence)
            print(tags)
            process(tags)
        except KeyboardInterrupt:
            break
        except Exception:
            # Keep the best-effort loop alive, but record the failure
            # instead of discarding it silently.
            logging.exception("failed to handle sentence; continuing")
            continue