def extract_positional_bigram_features(window, mid_ix, feature_val=1):
    """Extract bigram features keyed by their position relative to mid_ix."""
    bi_grams = compute_ngrams(window, max_len=2, min_len=2)
    d = {}
    for i, bi_gram in enumerate(bi_grams):
        d["BI" + ":" + str(-mid_ix + i) + " " + bi_gram[0] + " | " + bi_gram[1]] = feature_val
    return d

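# Every snippet in this section relies on a shared compute_ngrams helper whose
# implementation is not shown. The sketch below is a minimal reconstruction
# inferred from the call sites (ordered, contiguous n-grams; max_len=None
# meaning "up to the full token sequence"), not the original code.
def compute_ngrams(tokens, max_len=None, min_len=1):
    """Return all contiguous n-grams of length min_len..max_len, left to right."""
    if max_len is None:
        max_len = len(tokens)
    ngrams = []
    for n in range(min_len, max_len + 1):
        for i in range(len(tokens) - n + 1):
            ngrams.append(tokens[i:i + n])
    return ngrams
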
def extend_chains(chains):
    """Serialise each chain plus every contiguous sub-chain of length >= 3."""
    ext_chains = set()
    for tokens in chains:
        ext_chains.add(",".join(tokens))
        ngrams = compute_ngrams(tokens, max_len=None, min_len=3)
        for t in ngrams:
            ext_chains.add(",".join(t))
    return ext_chains

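# Quick illustration (using the compute_ngrams sketch above): each chain is
# kept whole and every contiguous sub-chain of length >= 3 is added too.
chains = [["a", "b", "c", "d"]]
print(sorted(extend_chains(chains)))
# -> ['a,b,c', 'a,b,c,d', 'b,c,d']
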
def get_vector_space(self, tokenized_docs):
    # Requires: from collections import defaultdict; import numpy as np
    def collapse(tag, ngram):
        return tag + ":" + "-".join(ngram)

    collapse_ngram = lambda ngram: collapse("ng", ngram)
    collapse_skip_gram = lambda ngram: collapse("sg", ngram)

    data = []
    df_tally = defaultdict(int)
    for doc in tokenized_docs:
        ngrams = compute_ngrams(doc, 2, 2)
        #skip_grams = compute_skip_grams(doc, 5)
        skip_grams = []
        example = doc + [collapse_ngram(ng) for ng in ngrams] \
                      + [collapse_skip_gram(sg) for sg in skip_grams]
        data.append(example)
        # compute doc freq
        for item in set(example):
            df_tally[item] += 1

    # Remove terms with low document frequency
    processed_data = []
    for example in data:
        row = [term for term in example if df_tally[term] >= self.min_word_count]
        processed_data.append(row)
    del data  # prevent bugs due to later access

    lat_vector_model = self.vector_space_func(processed_data)

    # Defaults to 1.0 when use_idf is False
    self.df = defaultdict(lambda: 1.0)
    if self.use_idf:
        for k, v in df_tally.items():
            self.df[k] = np.log(v + 1)

    collapsed = []
    for example in processed_data:
        vectors = []
        for token in example:
            v = lat_vector_model.project(token)
            if v is not None:
                # self.df[token] is 1.0 when use_idf is False
                vectors.append(np.array(v) * self.num_topics / self.df[token])
        collapsed.append(self.func(vectors))
    print("Constructed Vector Space")
    return (collapsed, dict())

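# The weighting above scales each projected token vector by
# num_topics / log(df + 1), down-weighting terms that appear in many
# documents. A small numeric illustration (num_topics=100 is an assumed
# value; the real value comes from the model configuration):
import numpy as np

v = np.array([0.2, 0.8])
for df in (1, 100):
    print(df, v * 100 / np.log(df + 1))
# df=1   -> scale 100/log(2)   ~ 144.3 -> [ 28.9 115.4]
# df=100 -> scale 100/log(101) ~  21.7 -> [  4.3  17.3]
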
def bigram_features(window, mid_ix=None, feature_val=1):
    """
    window : list of str
        words in window
    mid_ix : int
        position of word to predict (unused here; kept for interface compatibility)
    feature_val : Any
        value for feature

    returns : dict
        dct[str] : val

    Extracts bi-gram word features, IGNORING POSITION
    """
    bi_grams = compute_ngrams(window, max_len=2, min_len=2)
    d = dict()
    for bi_gram in bi_grams:
        d["BI" + ":" + " " + bi_gram[0] + " | " + bi_gram[1]] = feature_val
    return d

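# Example output (using the compute_ngrams sketch above):
print(bigram_features(["the", "quick", "fox"]))
# -> {'BI: the | quick': 1, 'BI: quick | fox': 1}
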
def tagged_sents_to_word_windows(tagged_sents, window_size):
    """Convert tagged sentences into (window, tag) pairs of width window_size."""
    offset = int((window_size - 1) / 2)
    tagged_windows = []
    for sent in tagged_sents:
        wds, tags = zip(*sent)
        wds = list(wds)
        # pad sentence
        for _ in range(offset):
            wds.insert(0, SENT_START)
            wds.append(SENT_END)
        windows = compute_ngrams(wds, max_len=window_size, min_len=window_size)
        #numbered_windows = map(window_to_sequence, windows)
        #tagged = zip(numbered_windows, tags)
        tagged = zip(windows, tags)
        tagged_windows.extend(tagged)
    return tagged_windows

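# Example usage, assuming SENT_START / SENT_END are plain padding tokens
# (their exact values are defined elsewhere in the codebase):
SENT_START, SENT_END = "<s>", "</s>"
sents = [[("cats", "NNS"), ("sleep", "VBP")]]
for window, tag in tagged_sents_to_word_windows(sents, window_size=3):
    print(window, tag)
# -> ['<s>', 'cats', 'sleep'] NNS
# -> ['cats', 'sleep', '</s>'] VBP
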
def trigram_features(window, mid_ix=None, feature_val=1):
    """
    window : list of str
        words in window
    mid_ix : int
        position of word to predict (unused here; kept for interface compatibility)
    feature_val : Any
        value for feature

    returns : dict
        dct[str] : val

    Extracts tri-gram word features, IGNORING POSITION
    """
    tri_grams = compute_ngrams(window, max_len=3, min_len=3)
    d = {}
    for tri_gram in tri_grams:
        # " | " separator kept consistent with the bigram features above
        tri_gram_key = tri_gram[0] + " | " + tri_gram[1] + " | " + tri_gram[2]
        d["TRI" + ":" + " " + tri_gram_key] = feature_val
    return d

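# Example output (using the compute_ngrams sketch above):
print(trigram_features(["the", "quick", "brown", "fox"]))
# -> {'TRI: the | quick | brown': 1, 'TRI: quick | brown | fox': 1}
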
def positional_bigram_features(window, mid_ix=None, feature_val=1):
    """
    window : list of str
        words in window
    mid_ix : int
        position of word to predict
    feature_val : Any
        value for feature

    returns : dict
        dct[str] : val

    Extracts bi-gram word features, INCLUDING POSITION
    """
    if mid_ix is None:
        mid_ix = compute_middle_index(window)
    bi_grams = compute_ngrams(window, max_len=2, min_len=2)
    d = {}
    for i, bi_gram in enumerate(bi_grams):
        d["P_BI" + ":" + str(-mid_ix + i) + " " + bi_gram[0] + " | " + bi_gram[1]] = feature_val
    return d

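# Example output; mid_ix is passed explicitly here so the snippet does not
# depend on compute_middle_index, which is defined elsewhere:
print(positional_bigram_features(["a", "b", "c"], mid_ix=1))
# -> {'P_BI:-1 a | b': 1, 'P_BI:0 b | c': 1}
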
def extract_ngram_features_stemmed(offset, ngram_size, input, val=1):
    """
    offset : int
        the number of words either side of the input to extract features from
    ngram_size : int
        the size of the ngrams
    input : FeatureExtactorInput
        input to feature extractor

    returns : dict
        dictionary of features
    """
    feats = {}
    end = len(input.sentence) - 1

    # clip the window to within the sentence bounds
    start = max(0, input.wordix - offset)
    stop = min(end, input.wordix + offset)

    window = list(input.sentence[start:stop + 1])
    window = list(map(stem, window))

    # pad with boundary markers where the window ran off either end
    if input.wordix < offset:
        diff = offset - input.wordix
        for i in range(diff):
            window.insert(0, __START__)
    if input.wordix + offset > end:
        diff = input.wordix + offset - end
        for i in range(diff):
            window.append(__END__)

    ngrams = compute_ngrams(window, ngram_size, ngram_size)
    str_num_ngrams = str(ngram_size)
    for i, offset_ngram in enumerate(ngrams):
        relative_offset = str(i - offset)
        str_ngram = ",".join(offset_ngram)
        feats["POS_" + str_num_ngrams + "GRAMS:" + relative_offset + "->" + str_ngram] = val
    return feats

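# The function above expects a FeatureExtactorInput with .sentence and .wordix
# attributes, plus module-level __START__/__END__ markers and a stem()
# function. A minimal, hypothetical harness for trying it out (the stand-in
# stemmer just lower-cases; the real code presumably uses a proper stemmer):
from collections import namedtuple

__START__, __END__ = "<START>", "<END>"
stem = lambda word: word.lower()

FeatureExtactorInput = namedtuple("FeatureExtactorInput", ["sentence", "wordix"])
inp = FeatureExtactorInput(sentence=["The", "cat", "sat"], wordix=0)
print(extract_ngram_features_stemmed(offset=1, ngram_size=2, input=inp))
# -> {'POS_2GRAMS:-1-><START>,the': 1, 'POS_2GRAMS:0->the,cat': 1}
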
def extract_ngram_features(tokens, idx):
    # Note: offset, ngram_size, stem_words and positional are read from the
    # enclosing scope; this function is a closure.
    feats = []
    end = len(tokens) - 1

    # clip the window to within the sentence bounds
    start = max(0, idx - offset)
    stop = min(end, idx + offset)

    prefix = "STEM_" if stem_words else ""

    window = list(tokens[start:stop + 1])
    if stem_words:
        window = [stem(x) for x in window]  # a list, not a lazy map, so insert/append work below

    if idx < offset:
        diff = offset - idx
        for i in range(diff):
            window.insert(0, __START__)
    if idx + offset > end:
        diff = idx + offset - end
        for i in range(diff):
            window.append(__END__)

    ngrams = compute_ngrams(window, ngram_size, ngram_size)
    str_num_ngrams = str(ngram_size)
    for i, offset_ngram in enumerate(ngrams):
        relative_offset = str(i - offset) if positional else "BOW"
        str_ngram = ",".join(offset_ngram)
        feats.append(prefix + "POS_" + str_num_ngrams + "_GRAMS:" + relative_offset + "->" + str_ngram)
    return feats

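# With positional=False every n-gram is filed under the single offset "BOW"
# rather than a signed relative position. A hypothetical wiring of the
# closure's free variables (in the source they come from an enclosing scope
# that is not shown):
offset, ngram_size = 1, 2
stem_words, positional = False, False
print(extract_ngram_features(["the", "cat", "sat"], idx=1))
# -> ['POS_2_GRAMS:BOW->the,cat', 'POS_2_GRAMS:BOW->cat,sat']
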
def extract(self, words: List[str]) -> List[str]:
    # Stem first, then build n-grams over the stemmed tokens
    stemmed_words = [self.stem(word) for word in words]
    stemmed_ngrams = compute_ngrams(tokens=stemmed_words,
                                    max_len=self.max_ngram_len,
                                    min_len=1)  # type: List[List[str]]
    return [("--".join(ngram)).lower() for ngram in stemmed_ngrams]

def extract(self, words: List[str]) -> List[str]:
    ngrams = compute_ngrams(tokens=words,
                            max_len=self.max_ngram_len,
                            min_len=1)  # type: List[List[str]]
    return [("--".join(ngram)).lower() for ngram in ngrams]

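# Illustration of the two extractors' output format, assuming max_ngram_len=2
# (in the real classes self.max_ngram_len and self.stem are injected via the
# constructor):
words = ["Big", "Cats"]
ngrams = compute_ngrams(tokens=words, max_len=2, min_len=1)
print([("--".join(ngram)).lower() for ngram in ngrams])
# -> ['big', 'cats', 'big--cats']
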
def get_conditional_feats(self, action_history, action_tag_pair_history, tos,
                          buffer, previous_tags, subsequent_tags):
    # Requires: from collections import defaultdict
    feats = {}
    if len(action_history) == 0:
        feats["first_action"] = self.positive_val
    if len(subsequent_tags) == 0:
        feats["last_tag"] = 1

    feats["num_actions"] = len(action_history)
    feats["num_prev_tags"] = len(previous_tags)
    feats["num_subsequent_tags"] = len(subsequent_tags)
    feats["num_tags"] = 1 + len(previous_tags) + len(subsequent_tags)

    feats["tos:" + tos] = self.positive_val
    feats["buffer:" + buffer] = self.positive_val
    feats["tos_buffer:" + tos + "|" + buffer] = self.positive_val
    # order-independent combination of top-of-stack and buffer
    feats["tos_buffer_combo:" + ",".join(sorted([tos, buffer]))] = self.positive_val

    ### PREVIOUS TAGS
    for i, tag in enumerate(previous_tags[::-1]):
        feats["prev_tag-{i}:{tag}".format(i=i, tag=tag)] = self.positive_val
        feats["prev_tag:{tag}".format(tag=tag)] = self.positive_val
    if len(previous_tags) > 0:
        feats["prev-tag-tos-buffer:{tag}_{tos}_{buffer}".format(
            tag=previous_tags[-1], tos=tos, buffer=buffer)] = self.positive_val
        feats["prev-tag-buffer:{tag}_{buffer}".format(
            tag=previous_tags[-1], buffer=buffer)] = self.positive_val
        feats["prev-tag-tos:{tag}_{tos}".format(
            tag=previous_tags[-1], tos=tos)] = self.positive_val

    bigrams = compute_ngrams(previous_tags, 2, 2)
    for i, bigram in enumerate(bigrams[::-1]):
        feats["prev_bigram-tag-{i}:{tag}".format(i=i, tag=str(bigram))] = self.positive_val
        feats["prev_bigram-tag:{tag}".format(tag=str(bigram))] = self.positive_val

    ### REMAINING TAGS
    for i, tag in enumerate(subsequent_tags):
        feats["subseq_tag-{i}:{tag}".format(i=i, tag=tag)] = self.positive_val
        feats["subseq_tag:{tag}".format(tag=tag)] = self.positive_val
    if len(subsequent_tags) > 0:
        feats["subseq-tag-tos-buffer:{tag}_{tos}_{buffer}".format(
            tag=subsequent_tags[0], tos=tos, buffer=buffer)] = self.positive_val
        feats["subseq-tag-buffer:{tag}_{buffer}".format(
            tag=subsequent_tags[0], buffer=buffer)] = self.positive_val
        feats["subseq-tag-tos:{tag}_{tos}".format(
            tag=subsequent_tags[0], tos=tos)] = self.positive_val

    bigrams = compute_ngrams(subsequent_tags, 2, 2)
    for i, bigram in enumerate(bigrams):
        feats["subseq_bigram-tag-{i}:{tag}".format(i=i, tag=str(bigram))] = self.positive_val
        feats["subseq_bigram-tag:{tag}".format(tag=str(bigram))] = self.positive_val

    # features for each previous action
    action_tally = defaultdict(int)
    for i, action in enumerate(action_history[::-1]):
        feats["action-{i}:{action}".format(i=i, action=action)] = self.positive_val
        feats["action:{action}".format(action=action)] = self.positive_val
        action_tally[action] += 1

    # Features for the number of times each action has been performed
    for action, count in action_tally.items():
        feats["action-tally:{action}_{count}".format(action=action, count=count)] = self.positive_val

    if len(action_history) > 0:
        feats["prev_action-tos-buffer:{action}_{tos}_{buffer}".format(
            action=action_history[-1], tos=tos, buffer=buffer)] = self.positive_val
        feats["prev_action-buffer:{action}_{buffer}".format(
            action=action_history[-1], buffer=buffer)] = self.positive_val
        feats["prev_action-tos:{action}_{tos}".format(
            action=action_history[-1], tos=tos)] = self.positive_val

    bigrams = compute_ngrams(action_history, 2, 2)
    for i, bigram in enumerate(bigrams[::-1]):
        feats["prev_bigram_action-{i}:{tag}".format(i=i, tag=str(bigram))] = self.positive_val
        feats["prev_bigram_action:{tag}".format(tag=str(bigram))] = self.positive_val

    for i, (action, prev_tos, prev_buffer) in enumerate(action_tag_pair_history[::-1]):
        feats["actiontag-{i}:{action}_{tos}_{buffer}".format(
            i=i, action=action, tos=prev_tos, buffer=prev_buffer)] = self.positive_val
        feats["actiontag:{action}_{tos}_{buffer}".format(
            action=action, tos=prev_tos, buffer=prev_buffer)] = self.positive_val
        feats["actiontos-{i}:{action}_{tos}".format(
            i=i, action=action, tos=prev_tos)] = self.positive_val
        feats["actiontos:{action}_{tos}".format(
            action=action, tos=prev_tos)] = self.positive_val
        feats["actionbuffer-{i}:{action}_{buffer}".format(
            i=i, action=action, buffer=prev_buffer)] = self.positive_val
        feats["actionbuffer:{action}_{buffer}".format(
            action=action, buffer=prev_buffer)] = self.positive_val

    if len(action_tag_pair_history) > 0:
        action, prev_tos, prev_buffer = action_tag_pair_history[-1]
        feats["prev_actiontag_tos_buffer_current_tos_current_buffer:{action}_{prev_tos}_{prev_buffer}_{tos}_{buffer}".format(
            action=action, prev_tos=prev_tos, prev_buffer=prev_buffer,
            tos=tos, buffer=buffer)] = self.positive_val
        feats["prev_actiontag_tos_buffer_current_buffer:{action}_{prev_tos}_{prev_buffer}_{buffer}".format(
            action=action, prev_tos=prev_tos, prev_buffer=prev_buffer,
            buffer=buffer)] = self.positive_val
        feats["prev_actiontag_tos_buffer_current_tos:{action}_{prev_tos}_{prev_buffer}_{tos}".format(
            action=action, prev_tos=prev_tos, prev_buffer=prev_buffer,
            tos=tos)] = self.positive_val
    return feats

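# Minimal smoke test, assuming the method is visible at module scope as shown
# and positive_val=1 (the surrounding parser class is not part of the source):
class _Stub:
    positive_val = 1
    get_conditional_feats = get_conditional_feats

feats = _Stub().get_conditional_feats(
    action_history=[], action_tag_pair_history=[], tos="ROOT",
    buffer="cat", previous_tags=[], subsequent_tags=["NN"])
print(feats["first_action"], feats["tos_buffer:ROOT|cat"])
# -> 1 1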