from collections import defaultdict, Counter
# conll_seq_generator, along with constants such as START_TAG, END_TAG, and
# UNK used below, is assumed to be provided elsewhere in this codebase.

def get_tag_word_counts(trainfile):
    """
    Produce a Counter of occurrences of each word for each tag

    Parameters:
    trainfile: -- the filename to be passed as argument to conll_seq_generator
    :returns: -- a defaultdict of counters, where the keys are tags.
    """
    all_counters = defaultdict(lambda: Counter())
    for words, tags in conll_seq_generator(trainfile):
        # count every (tag, word) pair in the sentence
        for word, tag in zip(words, tags):
            all_counters[tag][word] += 1
    return all_counters
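# Minimal usage sketch (an illustration, not part of the original code;
# TRAIN_FILE is assumed to name a CoNLL-format training file):
#
#     counters = get_tag_word_counts(TRAIN_FILE)
#     for tag, counter in counters.items():
#         print(tag, counter.most_common(3))  # three most frequent words per tag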
def setup():
    global word_to_ix, tag_to_ix, X_tr, Y_tr, model, embedding_dim
    # build the vocabulary and word-to-index mapping
    vocab, word_to_ix = most_common.get_word_to_ix(TRAIN_FILE, max_size=6500)
    # assign each tag a unique index, in order of first appearance
    tag_to_ix = {}
    for words, tags in preprocessing.conll_seq_generator(TRAIN_FILE):
        for tag in tags:
            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)
    # fix the random seed so model initialization is reproducible
    torch.manual_seed(765)
    embedding_dim = 30
    hidden_dim = 30
    model = bilstm.BiLSTM(len(word_to_ix), tag_to_ix, embedding_dim, hidden_dim)
    # collect the training sentences and their tag sequences
    X_tr = []
    Y_tr = []
    for words, tags in preprocessing.conll_seq_generator(TRAIN_FILE):
        X_tr.append(words)
        Y_tr.append(tags)
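# Sketch of how a sentence would be tensorized before being fed to the BiLSTM
# above. This is a hypothetical helper (the real bilstm module may supply its
# own version); it assumes UNK is the unknown-word token from most_common.
import torch

def prepare_sequence(words, word_to_ix):
    # map each word to its index, falling back to UNK for out-of-vocabulary words
    idxs = [word_to_ix.get(w, word_to_ix[UNK]) for w in words]
    return torch.tensor(idxs, dtype=torch.long)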
def setup():
    global word_to_ix, tag_to_ix, X_tr, Y_tr, model, embedding_dim
    vocab, word_to_ix = most_common.get_word_to_ix(TRAIN_FILE, max_size=6500)
    # collect the training sentences and their tag sequences
    X_tr = []
    Y_tr = []
    for words, tags in preprocessing.conll_seq_generator(TRAIN_FILE):
        X_tr.append(words)
        Y_tr.append(tags)
    # fix the random seed so the embedding initialization is reproducible
    torch.manual_seed(765)
    embedding_dim = 30
    hidden_dim = 30  # unused by CBOW; kept from the original setup
    model = cbow.CBOW(len(vocab), embedding_dim)
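# Hypothetical sketch of how CBOW (context, target) training pairs could be
# built from X_tr; the real cbow module may do its own windowing, and
# window=2 is an assumed value, not taken from the original code.
def make_cbow_instances(sentences, window=2):
    instances = []
    for words in sentences:
        for i, target in enumerate(words):
            # the context is up to `window` words on each side of the target
            context = words[max(0, i - window):i] + words[i + 1:i + 1 + window]
            if context:
                instances.append((context, target))
    return instances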
def get_tag_trans_counts(input_file):
    """compute a dict of counters for tag transitions

    :param input_file: name of file containing training data
    :returns: dict, in which keys are tags, and values are counters of succeeding tags
    :rtype: dict
    """
    tot_counts = defaultdict(lambda: Counter())
    for words, tags in conll_seq_generator(input_file):
        for i, tag in enumerate(tags):
            # the first tag of each sentence follows START_TAG
            if i == 0:
                tot_counts[START_TAG].update([tag])
            # the last tag transitions to END_TAG; every other tag
            # transitions to the next tag in the sentence
            if i == len(tags) - 1:
                tot_counts[tag].update([END_TAG])
            else:
                tot_counts[tag].update([tags[i + 1]])
    return dict(tot_counts)
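# Hypothetical follow-on (not in the original code): a Viterbi tagger
# typically consumes transition log-probabilities rather than raw counts.
# A minimal add-alpha smoothed conversion, assuming a fixed tag set and an
# assumed smoothing value alpha=0.1:
import math

def get_transition_log_probs(trans_counts, all_tags, alpha=0.1):
    log_probs = {}
    for prev_tag, counter in trans_counts.items():
        total = sum(counter.values())
        for tag in all_tags:
            # counter[tag] is 0 for unseen transitions; alpha keeps them finite
            log_probs[(prev_tag, tag)] = math.log(
                (counter[tag] + alpha) / (total + alpha * len(all_tags)))
    return log_probs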
def get_nb_weights(trainfile, smoothing):
    """
    estimate_nb assumes one label per document, whereas in POS tagging we
    have a label for each individual token. So, to calculate the emission
    weights P(w|y) for a particular word and tag, we modify the input so
    that each token and its tag are treated as a one-word document with its
    own label. The helper code below converts the dataset to token-level
    bag-of-words feature vectors and labels. The weights obtained here are
    used later as emission scores for the Viterbi tagger.

    inputs:
    trainfile: input file to obtain the nb_weights from
    smoothing: value of smoothing for the naive_bayes weights
    :returns: nb_weights: naive bayes weights
    """
    token_level_docs = []
    token_level_tags = []
    for words, tags in preprocessing.conll_seq_generator(trainfile):
        # each token becomes a one-word "document"; its tag is the "label"
        token_level_docs += [{word: 1} for word in words]
        token_level_tags += tags
    nb_weights = estimate_nb(token_level_docs, token_level_tags, smoothing)
    return nb_weights
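# Usage sketch (illustrative only; the exact key format of the returned
# weights is an assumption about estimate_nb, commonly (tag, word) pairs of
# log-probabilities in this style of codebase):
#
#     nb_weights = get_nb_weights(TRAIN_FILE, smoothing=0.001)
#     # emission score for emitting 'dog' from the NOUN state:
#     # nb_weights[('NOUN', 'dog')]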
def get_word_to_ix(input_file, max_size=100000):
    """
    creates a vocab that is the list of most frequently occurring words,
    such that the size of the vocab <= max_size; also adds an UNK token to
    the vocab and then creates a dictionary that maps each word to a unique index

    :returns: vocab, dict
    vocab: list of words in the vocabulary
    dict: maps word to unique index
    """
    vocab_counter = Counter()
    for words, tags in conll_seq_generator(input_file):
        for word in words:
            vocab_counter[word] += 1
    # reserve one slot for UNK so the total vocab size stays <= max_size
    vocab = [word for word, val in vocab_counter.most_common(max_size - 1)]
    vocab.append(UNK)
    word_to_ix = {word: ix for ix, word in enumerate(vocab)}
    return vocab, word_to_ix
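# Usage sketch (illustrative, not in the original code); the size bound and
# the UNK index both hold by construction:
#
#     vocab, word_to_ix = get_word_to_ix(TRAIN_FILE, max_size=6500)
#     assert len(vocab) <= 6500
#     assert word_to_ix[UNK] == len(vocab) - 1  # UNK is appended last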
def get_tag_to_ix(input_file):
    """
    creates a dictionary that maps each tag (including the START_TAG and
    END_TAG) to a unique index and vice-versa

    :returns: dict1, dict2
    dict1: maps tag to unique index
    dict2: maps each unique index to its own tag
    """
    tag_to_ix = {}
    for words, tags in conll_seq_generator(input_file):
        for tag in tags:
            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)
    # adding START_TAG and END_TAG (currently disabled)
    #if START_TAG not in tag_to_ix:
    #    tag_to_ix[START_TAG] = len(tag_to_ix)
    #if END_TAG not in tag_to_ix:
    #    tag_to_ix[END_TAG] = len(tag_to_ix)
    ix_to_tag = {v: k for k, v in tag_to_ix.items()}
    return tag_to_ix, ix_to_tag
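# Sanity-check sketch (illustrative): tag_to_ix and ix_to_tag are inverse
# mappings by construction.
#
#     tag_to_ix, ix_to_tag = get_tag_to_ix(TRAIN_FILE)
#     assert all(ix_to_tag[ix] == tag for tag, ix in tag_to_ix.items())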