def main():
    parser = argparse.ArgumentParser(description='Part-of-Speech Tagging.')
    parser.add_argument('--prefix', '-p', type=str, default='',
                        help='specify prefix of files which will be used to store the model')
    parser.add_argument('--times', '-t', type=int, default=1,
                        help='specify iteration times')
    parser.add_argument('--all', '-a', action='store_true',
                        help='without this switch, the model will be trained on randomly sampled data')
    parser.add_argument('--file', '-f', type=str, default='',
                        help='specify test data file')
    parser.add_argument('--save', '-s', action='store_true',
                        help='enable this to save the model file')
    args = parser.parse_args()
    tagger = Tagger('data/wsj00-18.pos', args.times, not args.all, args.save, args.prefix)
    test_data = Processor(args.file)
    tagger.benchmark(test_data)
def main_tag(featureSet, options):
    labelCounter, featCounter = BookKeeper(), BookKeeper()
    labelCounter.readFromFile('{0}.labelNumbers'.format(options.modelName))
    featCounter.readFromFile('{0}.featureNumbers'.format(options.modelName))
    optionsDict = vars(options)
    optionsDict['labelCounter'] = labelCounter
    optionsDict['featCounter'] = featCounter
    optionsDict['modelFile'] = '{0}.model'.format(options.modelName)
    tagger = Tagger(featureSet, optionsDict)

    if options.inFeatFile:
        tagger_func = lambda: tagger.tag_features(options.inFeatFile)
        writer_func = lambda s, c: writeSentence(s, comment=c)
    elif options.input_dir:
        assert isdir(options.input_dir), "--input-dir must be a directory"
        out_dir = "{}_out".format(options.input_dir)
        os.mkdir(out_dir)
        tagger_func = lambda: tagger.tag_dir(options.input_dir)
        writer_func = lambda s, c: writeSentence(
            s, out=open(join(out_dir, '{}.tagged'.format(c)), 'a'))
    else:
        tagger_func = lambda: tagger.tag_corp(sys.stdin)
        writer_func = lambda s, c: writeSentence(s, comment=c)

    for sen, other in tagger_func():
        writer_func(sen, other)
def evaluation(hp, fn_model, data):
    tagger = Tagger(vocabs=hp['VOCAB'], params=fn_model, hp=hp['HYPERPARAMS'])
    gold = open(hp['TMP_GOLD'], 'w')
    pred = open(hp['TMP_PRED'], 'w')

    for i in range(len(data.ws_data)):
        words = data.words[i]
        pids = data.pos_data[i][1]
        postags = [tagger.id2pos[pid] for pid in pids]
        for w, p in zip(words, postags):
            gold.write(w + '\t' + p + '\n')
        gold.write('EOS\n')

        output = tagger.tagging(''.join(words))
        for w, p in zip(output.words, output.postags):
            pred.write(w + '\t' + p + '\n')
        pred.write('EOS\n')

    # Close (and thus flush) both files before evaluating them.
    gold.close()
    pred.close()
    ws_f, pos_f = mecab_eval(hp['TMP_PRED'], hp['TMP_GOLD'])
    return ws_f, pos_f
def pos():
    post_data = request.json["text"]
    text_tagger = Tagger()
    response = text_tagger.run(post_data)
    return json.dumps(response)
def __init__(self, n=3, glm=False):
    Reader.__init__(self, n=n)
    Tagger.__init__(self, glm=glm)
    self.tags = set()
    self.tokens = set()
    self.emission_params = {}
    self.transition_params = {}
def _evaluation(hp, fn_model, data):
    tagger = Tagger(vocabs=hp['VOCAB'], params=fn_model, hp=hp['HYPERPARAMS'])

    def data_for_eval(words, postags):
        sent = []
        for w, p in zip(words, postags):
            p = w + "\t" + p
            if mecab_system_eval.PY_3:
                w = w.encode("UTF-8")
                p = p.encode("UTF-8")
            sent.append([w, p])
        return sent

    sys_data = []
    ans_data = []
    for i in range(len(data.ws_data)):
        words = data.words[i]
        pids = data.pos_data[i][1]
        postags = [tagger.id2pos[pid] for pid in pids]
        ans_data.append(data_for_eval(words, postags))

        output = tagger.tagging(''.join(words))
        sys_data.append(data_for_eval(output.words, output.postags))

    r = mecab_system_eval.mecab_eval(sys_data, ans_data)
    _, _, ws_f, _, _, pos_f = mecab_system_eval.calculate_fvalues(r)
    return ws_f, pos_f
def diversity_sampling(feature_now, model_ver, budget):
    csvfile = 'records_us/' + model_ver + '.csv'
    model_selected = []
    for i in feature_now:
        with tf.name_scope(model_ver + '/feature_{0}'.format(i)):
            model = Tagger(model_file=model_ver + '/feature_{0}'.format(i),
                           n_input=FEATURE_SHAPE[i][0],
                           n_steps=FEATURE_SHAPE[i][1],
                           feature_number=i)
            model.train([], [], feature_number=i)
            model_selected.append(model)

    train_data, test_data = data_generation(model_selected, feature_now)
    train_x_all, train_y_all = train_data
    test_x_all, test_y_all = test_data

    episode = 1
    print(">>>>>> Playing game ..")
    while episode <= MAX_EPISODE:
        sample_N = min(budget * 4, len(train_y_all))
        budget = min(budget, len(train_y_all))
        s = diversitySampling(train_x_all[:, :sample_N], pool=[], budget=budget)
        s.updateCplus()
        queried_indexs = s.newind
        for i in range(len(model_selected)):
            model_selected[i].train(np.array(train_x_all[i])[queried_indexs],
                                    np.array(train_y_all)[queried_indexs],
                                    feature_now[i])
        print(">>>>>> Terminate ...")
        write_csv(episode, csvfile, model_selected,
                  train_x_all, test_x_all, train_y_all, test_y_all)
        episode = episode + 1
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "hp:q", ["help"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    path = ""
    quiet = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            sys.exit(0)
        if opt == '-q':
            quiet = True
        elif opt == '-p':
            path = arg
        else:
            usage()
            sys.exit(0)

    if not path:
        usage()
        sys.exit(0)

    print("Starting tag scanning...")
    tagger = Tagger(path, quiet)
    tagger.scan_audio_files()
    print("...Done!")
    print()

    print("Starting file cataloguing...")
    dic = tagger.get_scanned_dic()
    cataloguer = Cataloguer(path, dic, quiet)
    cataloguer.create_catalogue()
    print("...Done!")
    print()
def learn(self, num_epochs, config_dict, seed):
    # config_dict contains a chosen value for each hyperparameter
    model = Tagger(self.modelname, self.datafile, self.paramfile,
                   num_epochs, self.batchsize, **config_dict)
    # metrics is a dict: {epoch: (model, train_loss, dev_loss, test_loss,
    #                             acc, f1_macro, f1_weighted)}
    metrics = model.train(num_epochs, seed, **config_dict)
    return metrics
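A minimal sketch of model selection on top of the metrics dict above, assuming only what the comment states (dev_loss at index 2 of each per-epoch tuple):

def best_epoch_by_dev_loss(metrics):
    # metrics: {epoch: (model, train_loss, dev_loss, test_loss,
    #                   acc, f1_macro, f1_weighted)}
    # Return the epoch whose tuple has the smallest dev_loss.
    return min(metrics, key=lambda epoch: metrics[epoch][2])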
def __init__(self):
    Tagger.__init__(self)
    self.upos = []
    self.model = Pipeline([
        ('vectorizer', DictVectorizer()),
        ('classifier', LogisticRegressionCV(
            Cs=10, fit_intercept=True, cv=None, dual=False, penalty='l2',
            scoring=None, solver='lbfgs', tol=0.0001, max_iter=100,
            class_weight=None, n_jobs=1, verbose=0, refit=True,
            intercept_scaling=1.0, multi_class='ovr', random_state=None))
    ])
def test_run1(filename, test_product, no_of_clusters):
    conv = Converter()
    cl = Cluster()
    tg = Tagger()
    targetCategory = test_product['category']
    result = cl.test_run(filename, test_product, no_of_clusters)
    category = tg.readCategory(result)
    conv.run('r200.txt', result, category, targetCategory, 'newSum1.txt')
def play_ner(feature_now, model_ver, poly, niter, logit, method):
    actions = 2
    global BUDGET
    tf.reset_default_graph()
    # Initialise the decision robot.
    if AGENT == "LSTMQ":
        robot = RobotLSTMQ(actions, FEATURE, content=CONTENT, poly=poly,
                           logit=logit, ntype=NTYPE, expnum=EXPNUM)
    else:
        print("** There is no robot.")
        raise SystemExit

    model_selected = []
    for i in feature_now:
        with tf.name_scope(model_ver + '/feature_{0}'.format(i)):
            model = Tagger(model_file=model_ver + '/feature_{0}'.format(i),
                           n_input=FEATURE_SHAPE[i][0],
                           n_steps=FEATURE_SHAPE[i][1],
                           feature_number=i, epochs=niter, expnum=EXPNUM)
            model.train([], [], feature_number=i)
            model_selected.append(model)
    game = initialise_game(model_selected, BUDGET, NITER, FEATURE, method)

    # Play the game.
    episode = 1
    rAll = []
    while episode <= MAX_EPISODE:
        observation = game.get_frame(model_selected)
        action = robot.get_action(observation)
        reward, observation2, terminal = game.feedback(action, model_selected)
        game.rAll.append(reward)
        rAll.append(reward)
        robot.update(observation, action, reward, observation2, terminal)
        if terminal:
            print("> Episodes finished: ", float("%.3f" % (episode / MAX_EPISODE)),
                  "> Reward: ", float("%.3f" % np.mean(rAll)))
            episode += 1
            rAll = []
            if episode == MAX_EPISODE:
                print('in')
                robot.save_Q_network(MODEL_VER)

    weights = find_weight.find_weight(model_selected, game.dev_x_all, game.dev_y_all)
    np.save(model_ver + '.npy', weights)
    print(weights)
    return robot
def download(self):
    audio = pafy.new(self.url).getbestaudio()
    file = audio.download()
    self.newtitle = self.slugify(audio.title)
    self.__convertToMp3(file, audio.extension)
    tagger = Tagger(self.newtitle + '.mp3', self.title, self.artist,
                    self.genre, self.album)
    mp3 = tagger.editTags()
    return self.__renameFile(mp3)
def mainTag(featureSet, options):
    transModel = None
    if not (options['printWeights'] or options['toCRFsuite']):
        print('loading transition model...', end='', file=sys.stderr, flush=True)
        transModel = TransModel.getModelFromFile(options['transModelFileName'])
        print('done', file=sys.stderr, flush=True)
    tagger = Tagger(featureSet, transModel, options)

    if options.get('inFeatFile'):
        # Tag a featurized file to outputStream
        for sen, comment in tagger.tagFeatures(options['inFeatFile']):
            writeSentence(sen, options['outputStream'], comment)
    elif options.get('ioDirs'):
        # Tag all files in a directory to fileName.tagged
        for sen, fileName in tagger.tagDir(options['ioDirs'][0]):
            writeSentence(sen, open(join(options['ioDirs'][1],
                                         '{0}.tagged'.format(fileName)),
                                    'a', encoding='UTF-8'))
    elif options.get('toCRFsuite'):
        # Write CRFsuite format to outputStream for tagging
        tagger.toCRFsuite(options['inputStream'], options['outputStream'])
    elif options.get('printWeights'):
        # Print MaxEnt weights to STDOUT
        tagger.printWeights(options['printWeights'], options['outputStream'])
    else:
        # Tag inputStream to outputStream
        for sen, comment in tagger.tagCorp(options['inputStream']):
            writeSentence(sen, options['outputStream'], comment)
def __init__(self, config, tagger=False, scraper=False, prepare_db=False):
    self.config = config
    if prepare_db and not self._prepare_database():
        return
    if scraper:
        sc = Scraper(self.config['parlis']['url'])
        sc.run()
    if tagger:
        ta = Tagger()
        ta.run()
def genKeyWords(self, question):
    questionToken = self.preProcess(question)
    tagger = Tagger('portugues')
    tokens = tagger.classify(questionToken)
    # Keep nouns, adjectives and verbs as keywords.
    keyList = []
    for tok in tokens:
        if tok[1] == 'N' or re.match('ADJ', tok[1]) or re.match('V', tok[1]):
            keyList.append(tok)
    print(keyList)
    print(len(keyList))
    return keyList
def AIC_predict():
    print("iter : ", config.iter)

    # Apply PIC processing to the morphologically analysed raw_sentence.
    # input:  config.result_input_path
    # output: config.result_processed_path
    main_tagger_PIC = Tagger()
    main_tagger_PIC.taggingPIC("result_tagging")

    # Apply AIC to the PIC-processed raw_sentence.
    # input:  config.result_processed_path
    # output: config.result_output_path
    main_tagger_AIC = Tagger()
    main_tagger_AIC.evaluateAIC("result")
    main_tagger_AIC.main_taggingAIC(mode="result_tagging")
def __init__(self):
    # Load the first pre-trained POS-tagger data
    uni, bi, tri, word = self.load_obj("tagger")
    self.TAGGER1 = Tagger(uni, bi, tri, word)

    # Load the second pre-trained POS-tagger data
    uni2, bi2, tri2, word2 = self.load_obj("tagger2")
    self.TAGGER2 = Tagger(uni2, bi2, tri2, word2)

    self.TAGGER3 = CRFTagger()
    self.TAGGER3.set_model_file(
        'dataset/all_indo_man_tag_corpus_model.crf.tagger')

    # Load the grammar chunker data
    self.load_chunker()
def gen_opt(self, file_text):
    '''
    Generate the new text: each word is classified and concatenated
    with its type via the '/' character, then concatenated with the
    result of the synset lookup.
    '''
    tagger = Tagger('portugues')
    tok = word_tokenize(file_text.read().decode('utf-8'))
    classified = tagger.classify(tok)
    p_text = []
    for c in classified:
        if c[1] == 'N' or re.match('ADJ', c[1]) or re.match('V', c[1]) or c[1] == '.':
            gen_set = self.gen_synset(c)
            p_text.append(gen_set)
    optimized_text = ' '.join(p_text)
    return optimized_text
def test_match_label_IOB_applied_correctly(self):
    tagger = Tagger()
    input = [[("Brunel", "", "", ""), ("University", "", "", ""),
              ("test", "", "", ""), ("test", "", "", "")],
             [("test", "", "", ""), ("test", "", "", ""),
              ("Brunel", "", "", ""), ("University", "", "", "")],
             [("test", "", "", ""), ("test", "", "", ""),
              ("Brunel", "", "", ""), ("University", "", "", "")]]
    input_label = "Brunel University"
    input_match_tag = "match"
    output = tagger.match_label(input, input_label, input_match_tag)
    output = tagger.match_label(output, input_label, input_match_tag)
    output = tagger.match_label(output, input_label, input_match_tag)
    output = tagger.add_default_entity_tags(output)
    # An "I" tag must never directly follow an "O" tag.
    correct_iob = True
    for line in output:
        for token_idx, token in enumerate(line):
            if token[3].split("-", 1)[0] == "O":
                next_token = ("EOL" if len(line) == token_idx + 1
                              else line[token_idx + 1][3].split("-", 1)[0])
                if next_token == "I":
                    correct_iob = False
    self.assertEqual(correct_iob, True)
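For reference, the IOB property this test enforces, on a toy tag sequence (tags illustrative only):

# Valid:    O  B-match  I-match  O      -- an "I" tag continues a "B"/"I" run
# Invalid:  O  I-match  ...             -- an "I" tag directly after "O" breaks IOB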
def analyze(self, text, tokenizer=str.split):
    """Analyze text and return pretty format.

    Args:
        text: string, the input text.
        tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.

    Returns:
        res: dict.
    """
    if not self.tagger:
        self.tagger = Tagger(self.model,
                             preprocessor=self.p,
                             tokenizer=tokenizer)
    return self.tagger.analyze(text)
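Note a design consequence of the caching above: the Tagger is built once with the first tokenizer, so a later call that passes a different tokenizer silently keeps the old one. A minimal sketch of a variant that rebuilds on tokenizer change (the _tokenizer attribute is an assumption, not in the source):

def analyze(self, text, tokenizer=str.split):
    # Rebuild the cached Tagger if this is the first call or the
    # tokenizer has changed since it was built.
    if not self.tagger or getattr(self, '_tokenizer', None) is not tokenizer:
        self._tokenizer = tokenizer
        self.tagger = Tagger(self.model, preprocessor=self.p, tokenizer=tokenizer)
    return self.tagger.analyze(text)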
def run(): print("start") listener = Recog() tagger = Tagger() print("done setting up") while True: try: sentence = listener.listen() # sentence = "make line graph using range from A1 to E4" tags = tagger.match_rules(sentence) print(tags) process(tags) except KeyboardInterrupt: break except Exception as e: continue
def conservative_sampling(feature_now, model_ver, budget):
    csvfile = 'records_cs/' + model_ver + '.csv'
    model_selected = []
    for i in feature_now:
        with tf.name_scope(model_ver + '/feature_{0}'.format(i)):
            model = Tagger(model_file=model_ver + '/feature_{0}'.format(i),
                           n_input=FEATURE_SHAPE[i][0],
                           n_steps=FEATURE_SHAPE[i][1],
                           feature_number=i)
            model.train([], [], feature_number=i)
            model_selected.append(model)

    train_data, test_data = data_generation(model_selected, feature_now)
    train_x_all, train_y_all = train_data
    test_x_all, test_y_all = test_data

    episode = 1
    print(">>>>>> Playing game ..")
    while episode <= MAX_EPISODE:
        # Compute uncertainty, which is 1 - confidence.
        sample_N = min(budget * 4, len(train_y_all))
        budget = min(budget, len(train_y_all))
        confidence = []
        conf_diff = np.zeros((sample_N,))
        for i in range(len(model_selected)):
            confidence.append(model_selected[i].get_confidence(list(train_x_all[i][:sample_N])))
        # Indices of the most and least confident models per sample.
        ind_max = np.argmax(confidence, axis=0)
        ind_min = np.argmin(confidence, axis=0)
        for i in range(sample_N):
            conf_diff[i] = confidence[ind_max[i]][i] - confidence[ind_min[i]][i]
        # Query the samples with the smallest confidence spread across models.
        queried_indexs = sorted(range(len(conf_diff)), key=lambda i: conf_diff[i])[:budget]
        print('top uncertainties found')
        for i in range(len(model_selected)):
            model_selected[i].train(np.array(train_x_all[i])[queried_indexs],
                                    np.array(train_y_all)[queried_indexs],
                                    feature_now[i])
        print(">>>>>> Terminate ...")
        write_csv(episode, csvfile, model_selected,
                  train_x_all, test_x_all, train_y_all, test_y_all)
        episode = episode + 1
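A toy walk-through of the confidence-spread computation above, with invented numbers for three models and four samples:

import numpy as np

confidence = np.array([[0.9, 0.40, 0.7, 0.5],   # model 0
                       [0.6, 0.50, 0.7, 0.9],   # model 1
                       [0.3, 0.45, 0.7, 0.2]])  # model 2
ind_max = np.argmax(confidence, axis=0)  # most confident model per sample
ind_min = np.argmin(confidence, axis=0)  # least confident model per sample
conf_diff = confidence[ind_max, np.arange(4)] - confidence[ind_min, np.arange(4)]
# conf_diff == [0.6, 0.1, 0.0, 0.7]; with budget=2 the sampler queries
# samples 2 and 1, where the models' confidences agree most closely.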
class SearchEngine:
    """
    The main class of this module: the data structure that holds
    the files the engine processes.
    """

    def __init__(self):
        self.tagger = Tagger('portugues')

    def insert(self, files):
        """
        Takes an array of files and returns a reverse-index list.
        """
        dataset = []
        for f in files:
            paragraphs = sent_tokenize(f[1].lower())
            for index, p in enumerate(paragraphs):
                words = word_tokenize(p)
                classes = self.tagger.classify(words)
                for c in classes:
                    if re.match('N', c[1]):
                        keysId = [item['_id'] for item in dataset]
                        print('number of keys: ' + str(len(keysId)))
                        if c[0] in keysId:
                            ind = keysId.index(c[0])
                            entry = dataset[ind]
                            if os.path.basename(f[0]) in entry.keys():
                                if index not in entry[os.path.basename(f[0])]:
                                    entry[os.path.basename(f[0])].append(index)
                            else:
                                entry[os.path.basename(f[0])] = [index]
                        else:
                            dataset.append({'_id': c[0], os.path.basename(f[0]): [index]})
        return dataset

    def extract(self, data):
        """
        Simple search that returns the set of matching paragraphs.
        """
        print('reached via the extract method')
        paragraphs = []
        for d in data:
            try:
                d.pop('_id')
            except KeyError:
                pass
            for k in d.keys():
                path_name = os.path.abspath('backend') + '/texts/'
                text = open(path_name + k + '.txt', encoding='utf-8').read()
                text_sent = sent_tokenize(text)
                for index in d[k]:
                    paragraphs.append(text_sent[index])
            print(paragraphs)
        return set(paragraphs)
def tag_file(training_filename, input_filename, output_filename):
    print('Training: ' + training_filename)
    tagger = Tagger(*process_pos_file(training_filename))

    print('Reading input: ' + input_filename)
    sentences = read_words_file(input_filename)

    print('Writing tagged output: ' + output_filename)
    count_tags = 0
    with open(output_filename, 'w') as output_file:
        length = len([word for sent in sentences for word in sent])
        for tagged_sentence in tagger.tag(sentences):
            for (tag, word) in tagged_sentence:
                output_file.write(tag + '\t' + word + '\n')
                count_tags += 1
                show_progress(count_tags / length)
            output_file.write('\n')
    print('Complete! Total tags: ' + str(count_tags) + '.')
def brain(command):
    response = ""
    # actions[:15] are verbs for search/find; actions[15:21] are verbs for
    # opening or playing media.
    actions = [
        "search", "find", "view", "reach", "detect", "get", "catch",
        "explore", "achieve", "obtain", "pass", "check", "reveal",
        "expose", "observe", "show", "see", "listen", "hear", "open",
        "watch", "arise", "awaken", "call", "consciousness", "get up",
        "stir", "wake", "wake up"
    ]
    tokens = Tokenizer().tokenize(command)

    # Call the weather function if the command mentions the weather and
    # names a city or country.
    citiesORcountries = weatherFunction(command)
    if 'weather' in command.split() and citiesORcountries != []:
        return ('the weather in ' + citiesORcountries[0] + ' is '
                + WeatherC().weatherForecast(citiesORcountries[0]) + ' today')

    action = None
    fileName = None
    tagSentence = Tagger().tag(tokens)
    for counter in range(len(tagSentence)):
        if tagSentence[counter][0] in actions:
            action = tagSentence[counter][0]
        elif tagSentence[counter][1] == 'NN':
            fileName = tagSentence[counter][0]

    normlizeAction = Normalizer().snowBallStemmer(action)
    if normlizeAction in actions:
        # Returns a list of files sharing the same name.
        filePath = FileSearch().search(fileName)
        if normlizeAction in actions[:15]:
            # Search for a folder or file.
            OpenMedia().openFile(filePath[0].split("//")[0])
            response = "i hope you're satisfied with our service"
            return response
        if normlizeAction in actions[15:21]:
            if (normlizeAction in ['listen', 'hear', 'watch']
                    and filePath[0].split('.')[1] not in ['mp3', 'mp4', 'mkv']):
                pass
            OpenMedia().openFile(filePath[0])
def test_pos_tag_same_nr_tokens(self):
    tagger = Tagger()
    input, output = self.pos_tag_get_results(tagger)
    input_nr_tuples = sum(len(line) for line in input)
    output_nr_tuples = sum(len(line) for line in output)
    self.assertEqual(input_nr_tuples, output_nr_tuples)
def textAnalyse(self, text):
    t = Tagger(text)
    wordcloud = t.pos_tag()
    lines = text.split('.')
    pol_val = 0.0
    result = ''
    size = len(lines)
    # Average the compound polarity score over all sentences.
    for line in lines:
        temp = self.sia.polarity_scores(line)
        pol_val += temp['compound']
        print(line, temp)
    pol_val = pol_val / size
    if pol_val < -0.1:
        result = "n"
    elif pol_val > 0.1:
        result = "p"
    else:
        result = "x"
    print(pol_val, result)
    return (result, abs(pol_val) * 100.0, wordcloud)
def tag(self):
    if not os.path.exists(self.lemma_file) or 'tag' in self.args.no_cache:
        print('Tagging')
        self.lemmas = Tagger(self.args.obt_path, self.promises).tag()
        with open(self.lemma_file, 'w') as out:
            out.write(json.dumps(self.lemmas))
    else:
        print('Reading lemmas')
        with open(self.lemma_file, 'r') as file:
            self.lemmas = json.load(file)
def testCleanSeparateLineFeed(self):
    self.assertEqual(
        Tagger.cleanSeparateLineFeeds([
            'a\tbold\n', '\n', 'b\titalic\n', 'c\tunderline\n', '\n',
            'd\tteletype\n', 'e\tsize:1\n', '\n', 'f\tsize:7\n',
            'g\tcolor:000000\n', '\n', 'h\tcolor:FFFFFF\n', '\n'
        ]),
        ['a\tbold\n', 'b\titalic\n', 'c\tunderline\n', 'd\tteletype\n',
         'e\tsize:1\n', 'f\tsize:7\n', 'g\tcolor:000000\n', 'h\tcolor:FFFFFF\n'])
def test_pos_tag_nonlocalner_labels_not_altered(self):
    tagger = Tagger()
    input, output = self.pos_tag_get_results(tagger)
    same_nlner_labels_returned = True
    for line_idx, line in enumerate(output):
        for tuple_idx, tuple in enumerate(line):
            if tuple[2] != input[line_idx][tuple_idx][2]:
                same_nlner_labels_returned = False
    self.assertEqual(same_nlner_labels_returned, True)
def test_match_label_pos_labels_not_altered(self):
    tagger = Tagger()
    input, output = self.match_label_get_results(tagger)
    same_pos_labels_returned = True
    for line_idx, line in enumerate(output):
        for tuple_idx, tuple in enumerate(line):
            if tuple[1] != input[line_idx][tuple_idx][1]:
                same_pos_labels_returned = False
    self.assertEqual(same_pos_labels_returned, True)
def test_pos_tag_same_token_strs_returned(self):
    tagger = Tagger()
    input, output = self.pos_tag_get_results(tagger)
    same_tokens_returned = True
    for line_idx, line in enumerate(output):
        for tuple_idx, tuple in enumerate(line):
            if tuple[0] != input[line_idx][tuple_idx][0]:
                same_tokens_returned = False
    self.assertEqual(same_tokens_returned, True)
def pos_tag(tweets):
    """
    Uses the POS tagger interface to tag part-of-speech in all the
    tweets' texts and stores the result as a dict in the tweet objects.
    """
    print("Tagging...")
    untagged_texts = []
    for tweet in tweets:
        tagger = Tagger()
        textbody = tweet.text
        for phrase in re.split(r"\.|!|\?", textbody):
            if len(phrase) < 2:
                continue
            phrase = phrase.replace("?", "").replace("!", "").replace(".", "")
            tags = tagger.tag_text(phrase)
            if tags is not None:
                tweet.tagged_words.append(tags)
    print("Untagged texts: ")
    for text in untagged_texts:
        print(text)
    print("Tagging done.")
    return tweets
def __init__(self, p):
    self.save_dir, _ = generate_directory(p.save_to)
    self.p = p
    print_p(self.p)
    self.tagger = Tagger.create_tagger(self.p)
    if 'load_from' in self.p and self.p.load_from is not None:
        self.load_model(self.p.load_from)
    logger.info('Setting up data...')
    self.streams = setup_data(self.p, use_unlabeled=True, use_labeled=True)
def main(training_file, training_dir, load_model, skip_train):
    logging.debug('Initializing random seed to 0.')
    random.seed(0)
    np.random.seed(0)

    if load_model:
        tagger = Tagger.load(load_model)
        data = TaggingDataset.load_from_file(training_file, vocab=tagger.vocab,
                                             tags=tagger.tags)
    else:
        assert not skip_train, 'Cannot --skip_train without a saved model.'
        logging.debug('Loading dataset from: %s' % training_file)
        data = TaggingDataset.load_from_file(training_file)
        logging.debug('Initializing model.')
        tagger = Tagger(data.vocab, data.tags)

    if not skip_train:
        train_data, dev_data = data.split(0.7)
        batches_train = train_data.prepare_batches(n_seqs_per_batch=10)
        batches_dev = dev_data.prepare_batches(n_seqs_per_batch=100)

        train_mgr = TrainingManager(
            avg_n_losses=len(batches_train),
            training_dir=training_dir,
            tagger_taste_fn=lambda: taste_tagger(tagger, batches_train),
            tagger_dev_eval_fn=lambda: eval_tagger(tagger, batches_dev),
            tagger_save_fn=lambda fname: tagger.save(fname)
        )

        logging.debug('Starting training.')
        while train_mgr.should_continue():
            mb_x, mb_y = random.choice(batches_train)
            mb_loss = tagger.learn(mb_x, mb_y)
            train_mgr.tick(mb_loss=mb_loss)

    evaluate_tagger_and_writeout(tagger)
def generation(self):
    self.tokenized = [nltk.word_tokenize(sent) for sent in self.sentences]
    self.generate_average_position()

    # Collect every tag seen for each word, then keep the most common one.
    self.types = {}
    tagger = Tagger(False)
    for sent in self.tokenized:
        for word, val in tagger.tag_sent(sent):
            self.types.setdefault(word, []).append(val)
    for element in self.types:
        most_common, num_most_common = Counter(self.types[element]).most_common(1)[0]
        self.types[element] = most_common

    # Build the word graph: one node per distinct word, linked in sentence order.
    num_sent = 1
    for sent in self.tokenized:
        num_word = 1
        last = None
        for mot in sent:
            if not self.isWordIn(mot):
                node = Etiquette(mot, num_sent, num_word)
                self.nodes.append(node)
            else:
                node = self.get_node_with_value(mot)
                node.add_sid_pid(num_sent, num_word)
            if num_word > 1:
                last.add_next(node.get_id())
            last = node
            num_word += 1
        num_sent += 1
def main(args):
    logging.debug('Initializing random seed to 0.')
    random.seed(0)
    np.random.seed(0)
    tf.set_random_seed(0)

    logging.debug('Loading training dataset from: %s' % args.training_file)
    train_data = TaggingDataset.load_from_file(args.training_file)
    dev_data = TaggingDataset.load_from_file(None, vocab=train_data.vocab,
                                             alphabet=train_data.alphabet,
                                             tags=train_data.tags)

    logging.debug('Initializing model.')
    tagger = Tagger(train_data.vocab, train_data.tags, train_data.alphabet,
                    word_embedding_size=args.word_embedding_size,
                    char_embedding_size=args.char_embedding_size,
                    num_chars=args.max_word_length,
                    num_steps=args.max_sentence_length,
                    optimizer_desc=args.optimizer,
                    generate_lemmas=args.generate_lemmas,
                    l2=args.l2,
                    dropout_prob_values=[float(x) for x in args.dropout.split(",")],
                    experiment_name=args.exp_name,
                    supply_form_characters_to_lemma=args.supply_form_characters_to_lemma,
                    threads=args.threads,
                    use_attention=args.use_attention,
                    scheduled_sampling=args.scheduled_sampling)

    batches_train = train_data.prepare_batches(
        args.batch_size, args.max_sentence_length, args.max_word_length)
    batches_dev = dev_data.prepare_batches(
        2100, args.max_sentence_length, args.max_word_length)

    train_mgr = TrainingManager(
        len(batches_train), args.eval_interval,
        training_dir=args.training_dir,
        tagger_taste_fn=lambda: taste_tagger(tagger, batches_train),
        tagger_dev_eval_fn=lambda: eval_tagger(tagger, batches_dev),
        tagger_save_fn=lambda fname: tagger.save(fname)
    )

    # Allow Ctrl+\ (SIGQUIT) to force an evaluation on the next tick.
    import signal
    force_eval = {"value": False}

    def handle_sigquit(signal, frame):
        logging.debug("Ctrl+\\ received, evaluation will be forced.")
        force_eval["value"] = True

    signal.signal(signal.SIGQUIT, handle_sigquit)

    logging.debug('Starting training.')
    try:
        permuted_batches = []
        while train_mgr.should_continue(max_epochs=args.max_epochs):
            if not permuted_batches:
                permuted_batches = batches_train[:]
                random.shuffle(permuted_batches)
            words, chars, tags, lengths, lemma_chars, chars_lengths = permuted_batches.pop()

            # Randomly replace singleton words with the OOV token.
            oov_mask = np.vectorize(
                lambda x: train_data.vocab.count(x) == 1
                and np.random.uniform() < args.oov_sampling_p)(words)
            words = np.where(oov_mask, np.zeros(words.shape), words)

            mb_loss = tagger.learn(words, chars, tags, lengths,
                                   lemma_chars, chars_lengths)
            train_mgr.tick(mb_loss=mb_loss, force_eval=force_eval["value"])
            force_eval["value"] = False
    except KeyboardInterrupt:
        logging.debug("Ctrl+C received, stopping training.")

    run_tagger_and_writeout(tagger, dev_data)
def testQuantifierPriorityAsterisk(self):
    self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('******'), '*')

def testQuantifierPriorityExclamation(self):
    self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('!!!!!!'), '!')

def testExpressionValidatorTwoDots(self):
    with self.assertRaises(SystemExit) as exit_val:
        Tagger.validateExpressions(['a..b'])
    self.assertEqual(exit_val.exception.code, 4)

def testCommandValidatorInvalidColor(self):
    with self.assertRaises(SystemExit) as exit_val:
        Tagger.validateCommands([['color:qwe@*fefeee']])
    self.assertEqual(exit_val.exception.code, 4)

def testQuantifierPriorityExclamationAndPlus(self):
    self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('+++!++'), '!')

def testExpressionValidatorNegationAndNegation(self):
    with self.assertRaises(SystemExit) as exit_val:
        Tagger.validateExpressions(['a!!b'])
    self.assertEqual(exit_val.exception.code, 4)

def testExpressionValidatorEmptyBrackets(self):
    with self.assertRaises(SystemExit) as exit_val:
        Tagger.validateExpressions(['(())'])
    self.assertEqual(exit_val.exception.code, 4)

def testQuantifierPriorityAsteriskAndPlus(self):
    self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('+**+*'), '*')

def testQuantifierPriorityPlusAndPipe(self):
    self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('|++||+'), '+')

def testExpressionValidatorAsteriskAndExclamation(self):
    with self.assertRaises(SystemExit) as exit_val:
        Tagger.validateExpressions(['a*!b'])
    self.assertEqual(exit_val.exception.code, 4)

def testExpressionValidatorPlusAndAsterisk(self):
    with self.assertRaises(SystemExit) as exit_val:
        Tagger.validateExpressions(['a+*b'])
    self.assertEqual(exit_val.exception.code, 4)

def testExpressionValidatorDisjunctionAndConcatenation(self):
    with self.assertRaises(SystemExit) as exit_val:
        Tagger.validateExpressions(['a|.b'])
    self.assertEqual(exit_val.exception.code, 4)

def testQuantifierPriorityDot(self):
    self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('..........'), '.')

def testQuantifierPriorityExclamationAndDot(self):
    self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('..!!..'), '!')

def testQuantifierPriorityExclamationAndPipe(self):
    self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('||!!|'), '!')

def testReformatMultipleQuantifiers(self):
    self.assertEqual(Tagger.bonusReformatMutlipleQuantifiers('sub+++'), 'sub+')

def testQuantifierPriorityPipe(self):
    self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('||||'), '|')

def testQuantifierPriorityPlus(self):
    self.assertEqual(Tagger.bonusMultipleQuantifiersPriority('+++++++'), '+')
def evaluate(n, corpus):
    """Runs the n-fold validation on a corpus"""
    if n < 1:
        n = 10
        print("n was too low and has been set to 10\n")

    # Get all the data
    sentences, correctTags, tagData = corpusReader(corpus)
    allCor = []
    allIncor = []
    for check in range(1, n + 1):
        # Divide all the data (see the dividList sketch after this function)
        divSent = dividList(sentences, n, check)
        divTags = dividList(correctTags, n, check)
        divTrain = dividList(tagData, n, check)
        # To count the total of correct and incorrect tags
        correctlyTagged = []
        incorrectlyTagged = []
        print("Check {} doing {}-fold on {}\n".format(check, n, corpus))
        # For each part to evaluate
        for i in range(0, n):
            # Get the parts to train on; they need to be flattened so
            # that we can use the Tagger
            trainingParts = divTrain[:i] + divTrain[i+1:]
            train = []
            for index in range(len(trainingParts)):
                train.extend(trainingParts[index])
            # Get the testing and evaluation data
            testingData = divSent[i]
            evaluationData = divTags[i]
            # Do some training
            uni, bi, tri, word = PB(train)
            tagger = Tagger(uni, bi, tri, word)
            # Reset counts
            correctTagCount = 0
            incorrectTagCount = 0
            # Go through each sentence and tag it
            for index in range(len(testingData)):
                tagged = tagger.tagSentence(testingData[index])
                for tag in range(len(tagged)):
                    if evaluationData[index][tag] == tagged[tag]:
                        correctTagCount += 1
                    else:
                        incorrectTagCount += 1
            # Print to let you know I haven't forgotten about you.
            print("{}-fold was tagged {}% correctly.".format(
                i + 1, round(correctTagCount / (correctTagCount + incorrectTagCount) * 100, 2)))
            # Save the n-fold counts
            correctlyTagged.append(correctTagCount)
            incorrectlyTagged.append(incorrectTagCount)
        allCor.extend(correctlyTagged)
        allIncor.extend(incorrectlyTagged)
        # Totals for this check, in numbers and as a percentage
        print("\n{} out of {} was correctly tagged.".format(
            sum(correctlyTagged), sum(correctlyTagged) + sum(incorrectlyTagged)))
        print("\nFor a total of {}% correctness.".format(
            round(sum(correctlyTagged) / (sum(correctlyTagged) + sum(incorrectlyTagged)) * 100, 2)))
    # Grand totals across all checks
    print("\n{} out of {} was correctly tagged.".format(
        sum(allCor), sum(allCor) + sum(allIncor)))
    print("\nFor a total of {}% correctness.".format(
        round(sum(allCor) / (sum(allCor) + sum(allIncor)) * 100, 2)))
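The fold-splitting helper dividList is not shown in the source. Below is a hypothetical sketch consistent with how it is called above; the rotation by `check` is pure assumption, added only so different checks see different splits:

def dividList(data, n, check):
    # Rotate the data by the check index (assumption), then cut into n folds.
    data = data[check:] + data[:check]
    size, rem = divmod(len(data), n)
    folds, start = [], 0
    for i in range(n):
        end = start + size + (1 if i < rem else 0)
        folds.append(data[start:end])
        start = end
    return folds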
def testCommandValidatorMaxSize(self):
    # size:7 is accepted as a valid command, so validation must not exit.
    Tagger.validateCommands([['size:7']])
    self.assertTrue(self)