def inference_dset(model: pl.LightningModule, dset: SignSequenceDataset):
    """Run the classifier over every sequence in *dset*.

    Returns a list with one entry per dataset item; each entry is a list of
    (predicted_class, true_class) int pairs — the labeled-sequence format
    expected by NLTK's HiddenMarkovModelTrainer.
    """
    import torch  # local import: torch is already a dependency of the model

    # Put the model in eval mode so dropout / batch-norm behave
    # deterministically during inference.
    model.eval()

    labeled = []
    # No gradients are needed for inference — skip building the autograd graph.
    with torch.no_grad():
        for i in trange(len(dset)):
            images, targets, _ = dset[i]
            logits = model(images)
            # argmax over the class dimension replaces the old
            # topk(1, dim=1) followed by a [:, 0] slice.
            predicted_signs = logits.argmax(dim=1).tolist()
            target_ints = [int(targ) for targ in targets.tolist()]
            labeled.append(list(zip(predicted_signs, target_ints)))
    return labeled


labeled_train = inference_dset(trained, dset_train)
labeled_val = inference_dset(trained, dset_val)

hmm_trainer = HiddenMarkovModelTrainer()
hmm_tagger = hmm_trainer.train(labeled_sequences=labeled_train)
hmm_tagger.test(labeled_val, verbose=False)
# Load the pre-tagged corpora ("*agree" files) and train an NLTK HMM tagger.
# Each line holds whitespace-separated tokens of the form word_TAG; the last
# two "_"-separated fields are taken as the (word, tag) pair, so words that
# themselves contain underscores still split correctly.
for file in glob.glob("twitie-tagger/corpora/*agree"):
    print(file)
    # Context manager guarantees the file is closed even if parsing raises
    # (the original opened and closed it by hand).
    with open(file) as f:
        lines = [line.strip().split() for line in f]
    # NOTE(review): tokenized_docs is initialized outside this snippet; the
    # rebinding form (not +=) is kept to preserve the original semantics.
    tokenized_docs = tokenized_docs + [
        [word.split("_")[-2:] for word in line if len(word) > 1]
        for line in lines
    ]

tokenized_docs_tuples = [[tuple(word) for word in line] for line in tokenized_docs]

# Sanity check: report any token that did not split into exactly (word, tag).
for sent in tokenized_docs_tuples:
    for word in sent:
        if len(word) != 2:
            print(word)

# Vocabulary lists; currently unused by the trainer below but kept for
# inspection / a supervised-trainer variant.
words = [word[0] for line in tokenized_docs for word in line]
wordsVocab = list(set(words))
states = [word[1] for line in tokenized_docs for word in line if len(word) > 1]
statesVocab = list(set(states))

HMMtrainer = HiddenMarkovModelTrainer()
hmmmodel = HMMtrainer.train(tokenized_docs_tuples)