def process_gloss_pos_line(inst, word_tag_dict, outfile):
    """
    Extract gloss-line POS features from a single instance and write them out.

    Looks up the POS tier aligned to the gloss words; for each POS tag that
    has a usable alignment, emits an svm-lite style feature line via
    ``write_gram`` (no prev/next-gram context features, lowercased).

    :param inst: instance to read the gloss POS tier from
    :param word_tag_dict: unused here; kept for interface compatibility
    :param outfile: open file-like object the feature lines are written to
    """
    # Locate the POS tier aligned with the gloss words; bail out if absent.
    pos_tier = inst.find(alignment=GLOSS_WORD_ID, type=POS_TIER_TYPE)
    if pos_tier is None:
        return

    for pos_tag in pos_tier:
        # A tag without an alignment can't be mapped back to a gloss word;
        # log it and move on.
        if ALIGNMENT not in pos_tag.attributes or not pos_tag.alignment:
            EXTRACT_LOG.debug("No alignment found for {} in tier {} igt {}".format(pos_tag.id, pos_tag.tier.id, pos_tag.igt.id))
            continue

        gloss_word = pos_tag.igt.find(id=pos_tag.alignment).value()
        label = pos_tag.value()

        # Emit the feature line for this (word, tag) pair.
        token = GoldTagPOSToken(gloss_word, goldlabel=label)
        write_gram(token,
                   feat_prev_gram=False,
                   feat_next_gram=False,
                   lowercase=True,
                   output=outfile)
def process_dicts(class_path):
    """
    Load the pickled count dictionaries, dump features for frequent words,
    and train a classifier from them.

    Words seen fewer than ``thresh`` times are skipped; every observed tag
    for a retained word is written out as a feature line via ``write_gram``.

    :param class_path: path the trained classifier is written to
                       (passed through to ``train_txt``)
    """
    # FIX: the original called pickle.load(open(...)) three times and never
    # closed the handles; use context managers so the files are always closed.
    # (c_path / d_path / m_path are module-level paths defined elsewhere.)
    with open(c_path, "rb") as c_f:
        c = pickle.load(c_f)
    with open(d_path, "rb") as d_f:
        d = pickle.load(d_f)
    with open(m_path, "rb") as m_f:
        m = pickle.load(m_f)

    print(len(c), len(d), len(m))

    # Threshold: minimum number of observations for a word to be emitted.
    thresh = 30

    # Now, we want to write out every word that we've seen at least `thresh` times.
    out_path = os.path.join(proj_root, "odin_feats.txt")

    # FIX: `with` guarantees the output file is closed even if write_gram
    # raises partway through.
    with open(out_path, "w", encoding="utf-8") as out_f:
        for w in d:  # unused enumerate index removed
            if d[w].total() < thresh:
                LOG.debug("Skipping {}".format(w))
            else:
                LOG.debug("Testing {}".format(w))
                # Emit one feature line per tag observed for this word.
                for tag in d[w].keys():
                    LOG.debug("Writing out tag for {}-{}".format(w, tag))
                    t = GoldTagPOSToken(w, goldlabel=tag)
                    write_gram(t,
                               output=out_f,
                               feat_next_gram=False,
                               feat_prev_gram=False,
                               lowercase=True)
        out_f.flush()

    train_txt(out_path, class_path)
def chunk_to_features(chunk, tag_method=None, posdict=None, context_feats=False):
    """
    Method to extract the gloss-line classifier features from a subset of
    instances. (Useful for parallelizing)

    :param chunk: iterable of instances to process
    :type chunk: list of RGIgt
    :param tag_method: tagging method passed through to ``get_pos_tags``
    :param posdict: POS dictionary forwarded to ``write_gram``
    :param context_feats: when True, also emit prev/next-gram context features
    :return: (feature-string, number of instances that had a gloss POS tier)
    """
    buf = StringIO()
    instance_count = 0

    for inst in chunk:
        # Look for the GLOSS_POS tier; skip instances that lack one.
        tier = inst.get_pos_tags(GLOSS_WORD_ID, tag_method=tag_method)
        if not tier:
            continue
        instance_count += 1

        last_idx = len(tier) - 1
        for idx, tag_item in enumerate(tier):
            # Tags without an alignment attribute can't be tied to a word.
            if ALIGNMENT not in tag_item.attributes:
                continue

            word = tag_item.igt.find(id=tag_item.attributes[ALIGNMENT]).value()
            label = tag_item.value()

            # Neighboring words for context features, when requested.
            # NOTE(review): neighbors are indexed with attributes[ALIGNMENT]
            # unguarded — presumably every tier entry carries an alignment
            # here; confirm against the data, else this could KeyError.
            prev_word = None
            next_word = None
            if context_feats:
                if idx > 0:
                    prev_word = tag_item.igt.find(id=tier[idx - 1].attributes[ALIGNMENT]).value()
                if idx < last_idx:
                    next_word = tag_item.igt.find(id=tier[idx + 1].attributes[ALIGNMENT]).value()

            # Write out features...
            token = GoldTagPOSToken(word, goldlabel=label)
            write_gram(
                token,
                feat_prev_gram=context_feats,
                feat_next_gram=context_feats,
                prev_gram=prev_word,
                next_gram=next_word,
                lowercase=True,
                output=buf,
                posdict=posdict,
            )

    return buf.getvalue(), instance_count
def write_out_gram_dict(subword_dict, feat_path, feat_list, threshold = 1):
    """
    Given the gram+tag dict, write out grams for those that have been
    seen enough to meet our threshold.

    :param subword_dict: nested dict of subword -> tag -> {'contexts': [(prev, next), ...]}
    :type subword_dict: TwoLevelCountDict
    :param feat_path: path the svm-lite style feature file is written to
    :param feat_list: collection of CLASS_FEATS_* flags selecting which
                      features ``write_gram`` should emit
    :param threshold: minimum observation count
                      NOTE(review): this parameter is currently never used in
                      the body — every subword/tag is written out regardless.
                      TODO confirm whether a count filter was intended here.
    """
    EXTRACT_LOG.log(NORM_LEVEL, 'Writing out svm-lite style features to "{}"...'.format(feat_path))

    # Load the posdict only if one of the dictionary-based features is requested;
    # otherwise pass the falsy sentinel through to write_gram unchanged.
    pd = load_posdict() if (CLASS_FEATS_DICT in feat_list) or (CLASS_FEATS_PDICT in feat_list) or (CLASS_FEATS_NDICT in feat_list) else False

    # FIX: open the file with a context manager so it is closed even if
    # write_gram raises partway through (the original leaked the handle
    # on any exception before feat_file.close()).
    with open(feat_path, 'w', encoding='utf-8') as feat_file:
        for subword in subword_dict.keys():
            for tag in subword_dict[subword].keys():
                # Write out the gram with this tag as many times as it appears,
                # once per recorded (prev, next) context pair...
                for prev_word, next_word in subword_dict[subword][tag]['contexts']:
                    gt = GoldTagPOSToken(subword, goldlabel=tag)
                    # -------------------------------------------
                    # Now, vary the features depending on whats in the list
                    # -------------------------------------------
                    write_gram(gt,
                               lowercase=True,
                               feat_next_gram=CLASS_FEATS_NEXSW in feat_list,
                               feat_prev_gram=CLASS_FEATS_PRESW in feat_list,
                               feat_suffix=CLASS_FEATS_SUF in feat_list,
                               feat_prefix=CLASS_FEATS_PRE in feat_list,
                               feat_has_number=CLASS_FEATS_NUM in feat_list,
                               feat_morph_num=CLASS_FEATS_NUMSW in feat_list,
                               feat_prev_gram_dict=CLASS_FEATS_PDICT in feat_list,
                               feat_next_gram_dict=CLASS_FEATS_NDICT in feat_list,
                               feat_basic=CLASS_FEATS_SW in feat_list,
                               feat_dict=CLASS_FEATS_DICT in feat_list,
                               posdict=pd,
                               next_gram=next_word,
                               prev_gram=prev_word,
                               output=feat_file)

    EXTRACT_LOG.log(NORM_LEVEL, 'Written')