def setup_module ():
    """Test fixture: train Naive Bayes weights once for the whole module.

    Populates the module globals ``weights_nb`` (trained NB weights),
    ``alltags`` (tag inventory of TRAIN_FILE) and ``allwords`` (vocabulary;
    assumed to be an already-initialized set at module scope -- TODO confirm).

    NOTE(review): an identical ``setup_module`` is defined again below and
    shadows this one at import time.
    """
    global weights_nb
    global alltags
    global allwords
    tag_counters = most_common.get_tags(TRAIN_FILE)
    # Collect the vocabulary across every tag's word counter.
    for word_counter in tag_counters.values():
        allwords.update(set(word_counter.keys()))
    tag_totals = most_common.get_class_counts(tag_counters)
    weights_nb = naivebayes.learnNBWeights(tag_counters, tag_totals, allwords)
    alltags = preproc.getAllTags(TRAIN_FILE)
def setup_module():
    """Module-level test setup: learn NB weights from TRAIN_FILE.

    Side effects on module globals:
      weights_nb -- Naive Bayes weights trained on TRAIN_FILE
      allwords   -- extended with every word seen under any tag
                    (assumed to be a set initialized at module scope --
                    TODO confirm)
      alltags    -- all tags occurring in TRAIN_FILE
    """
    global weights_nb
    global alltags
    global allwords
    per_tag_counts = most_common.get_tags(TRAIN_FILE)
    for counter in per_tag_counts.values():
        allwords.update(counter.keys())
    totals = most_common.get_class_counts(per_tag_counts)
    weights_nb = naivebayes.learnNBWeights(per_tag_counts, totals, allwords)
    alltags = preproc.getAllTags(TRAIN_FILE)
def get_HMM_weights(trainfile):
    """Train a set of of log-prob weights using HMM transition model

    Parameters:
    trainfile -- The name of the file to train weights

    Returns:
    weights -- Weights dict with log-prob of transition and emit features
    """
    # Emission weights come from smoothed Naive Bayes over the tag/word counts.
    tag_counters = most_common.get_tags(trainfile)
    vocab = set()
    for word_counter in tag_counters.values():
        vocab.update(word_counter.keys())
    tag_totals = most_common.get_class_counts(tag_counters)
    nb_weights = naivebayes.learnNBWeights(tag_counters, tag_totals, vocab, 0.001)

    # Count tag-to-tag transitions directly from the CoNLL-style file:
    # one "word tag" pair per line, blank line between sentences.
    transitions = defaultdict(Counter)
    with open(trainfile) as instances:
        prev = START_TAG
        for raw in instances:
            stripped = raw.rstrip()
            if not stripped:
                # Sentence boundary: close the current sentence, start anew.
                transitions[prev][END_TAG] += 1
                prev = START_TAG
                continue
            fields = stripped.split()
            # Lines without an explicit tag are mapped to UNKNOWN.
            cur = fields[1] if len(fields) > 1 else UNKNOWN
            transitions[prev][cur] += 1
            prev = cur
        # File may not end with a blank line; close the last sentence.
        if prev != START_TAG:
            transitions[prev][END_TAG] += 1

    # Unseen features get a large negative log-weight instead of -inf.
    hmm_weights = defaultdict(lambda: -1000.)
    for key in nb_weights:
        hmm_weights[(key[0], key[1], EMIT)] = nb_weights[key]
    # Transition weight is the MLE log-prob: log count(prev->cur)/count(prev->*).
    for prev in transitions:
        outgoing = transitions[prev]
        total = sum(outgoing.values())
        for cur in outgoing:
            hmm_weights[(cur, prev, TRANS)] = np.log(outgoing[cur]) - np.log(total)
    return hmm_weights
def get_HMM_weights(trainfile):
    """Train a set of of log-prob weights using HMM transition model

    Parameters:
    trainfile -- The name of the file to train weights

    Returns:
    weights -- Weights dict with log-prob of transition and emit features
    """
    # --- emission weights: smoothed Naive Bayes over per-tag word counts ---
    counters = most_common.get_tags(trainfile)
    wordset = set()
    for cnts in counters.values():
        wordset.update(set(cnts.keys()))
    totals = most_common.get_class_counts(counters)
    nb_weights = naivebayes.learnNBWeights(counters, totals, wordset, 0.001)

    # --- transition counts from the tagged file (one token per line,
    # blank line terminates a sentence) ---
    trans_counts = defaultdict(Counter)
    with open(trainfile) as handle:
        last_tag = START_TAG
        for line in handle:
            text = line.rstrip()
            if len(text) == 0:
                # End of sentence: record transition into END_TAG.
                trans_counts[last_tag][END_TAG] += 1
                last_tag = START_TAG
            else:
                tokens = text.split()
                if len(tokens) > 1:
                    this_tag = tokens[1]
                else:
                    # Missing tag column -> UNKNOWN.
                    this_tag = UNKNOWN
                trans_counts[last_tag][this_tag] += 1
                last_tag = this_tag
        # Close out the final sentence if the file lacks a trailing blank line.
        if last_tag != START_TAG:
            trans_counts[last_tag][END_TAG] += 1

    # Default of -1000 stands in for log(0) on unseen features.
    hmm_weights = defaultdict(lambda: -1000.)
    for (tag, word), weight in nb_weights.items():
        hmm_weights[(tag, word, EMIT)] = weight
    # log q(cur | prev) = log count(prev, cur) - log sum_c count(prev, c)
    for prev, successors in trans_counts.items():
        denom = sum(successors.values())
        for cur, num in successors.items():
            hmm_weights[(cur, prev, TRANS)] = np.log(num) - np.log(denom)
    return hmm_weights
def get_HMM_weights(trainfile):
    """Train a set of log-prob weights using an HMM transition model.

    Emission weights are taken from smoothed Naive Bayes; transition
    weights are maximum-likelihood bigram estimates over tag sequences:
    q(t2 | t1) = count(t1, t2) / count(t1).

    Parameters:
    trainfile -- The name of the file to train weights

    Returns:
    hmm_weights -- defaultdict mapping feature tuples to log-probs:
        (tag, word, EMIT)          -> log P(word | tag)
        (cur_tag, prev_tag, TRANS) -> log P(cur_tag | prev_tag)
      Unseen features default to -1000.
    """
    # compute naive bayes weights
    counters = most_common.get_tags(trainfile)
    class_counts = most_common.get_class_counts(counters)
    allwords = set()
    for counts in counters.values():
        allwords.update(counts.keys())
    nb_weights = naivebayes.learnNBWeights(counters, class_counts, allwords, alpha=0.001)

    # convert nb weights to hmm emission weights
    hmm_weights = defaultdict(lambda: -1000.0)
    # BUGFIX: dict.iteritems() exists only in Python 2 and raises
    # AttributeError under Python 3; dict.items() works on both.
    for (tag, word), weight in nb_weights.items():
        hmm_weights[(tag, word, EMIT)] = weight

    # transition weights from tag unigram/bigram counts
    unigramCount = preproc.getAllCounts(preproc.getNgrams(trainfile))
    bigramCount = preproc.getAllCounts(preproc.getNgrams(trainfile, 2))
    # Iterate items directly instead of keys() + a redundant .get() lookup.
    for (tag1, tag2), pair_count in bigramCount.items():
        # .get(tag1, 0) preserves the original behavior (log(0) -> -inf)
        # if a tag appears in bigrams but somehow not in unigrams.
        hmm_weights[(tag2, tag1, TRANS)] = np.log(1.0 * pair_count) - np.log(
            unigramCount.get(tag1, 0)
        )
    return hmm_weights