def create_main_feature(qid, word, token, text, entities, tags):
    """Build the feature dictionary describing a single word.

    The returned dict holds the query id, the results of the word-shape
    helpers (``has_root``, ``full_upper``, ``full_lower``,
    ``is_capitalized``, ``word_len``), one indicator key for the category
    returned by ``sin_cat(word, tags)``, one indicator key for the word
    itself, and the ``token`` flag.

    Args:
        qid: value stored under ``'qid'``.
        word: the word the features are computed for.
        token: value stored under ``'ES_TOKEN'``.
        text: unused here; kept for signature compatibility.
        entities: unused here; kept for signature compatibility.
        tags: forwarded to ``sin_cat`` as its second argument.

    Returns:
        A new dict mapping feature names to values.
    """
    # Shape features first, so the helpers run in their original order.
    features = {
        'qid': qid,
        'TIENE_RAIZ': has_root(word),
        'FULL_MAYUSCULAS': full_upper(word),
        'FULL_MINUSCULAS': full_lower(word),
        'INICIO_MAYUSCULAS_RESTO_MINUSCULAS': is_capitalized(word),
    }

    syntactic_cat = sin_cat(word, tags)

    # 'PALABRA_LARGO' comes after the per-word indicator on purpose: if the
    # word is literally "LARGO" the length overwrites the indicator.
    features.update({
        'CAT_SINTACTICA_' + syntactic_cat: 1,
        'PALABRA_' + word: 1,
        'PALABRA_LARGO': word_len(word),
        'ES_TOKEN': token,
    })
    return features
def create_side(words, side, tmp_bow, text):
    """Add context-window features for the words on one side of a token.

    For every non-empty entry of ``words`` two counters in ``tmp_bow`` are
    incremented (and created at 1 when absent):

    * ``<PREFIX><word>``
    * ``<PREFIX>CAT_<category>``, where the category is the value returned
      by ``sin_cat(word, text)``

    ``<PREFIX>`` is ``'IZQUIERDA_'`` for the left side and ``'DERECHA_'``
    for the right side.

    Args:
        words: iterable of context words. Empty strings are skipped without
            being passed to ``sin_cat``.
            NOTE(review): they look like window padding - confirm with the
            caller.
        side: either ``'left'`` or ``'right'``.
        tmp_bow: feature dict to update; it is mutated in place.
        text: forwarded to ``sin_cat`` as its second argument.

    Returns:
        The same ``tmp_bow`` object, updated.

    Raises:
        ValueError: if ``side`` is neither ``'left'`` nor ``'right'``.
    """
    if side == 'left':
        word_prefix = 'IZQUIERDA_'
        cat_prefix = 'IZQUIERDA_CAT_'
    elif side == 'right':
        word_prefix = 'DERECHA_'
        cat_prefix = 'DERECHA_CAT_'
    else:
        # Fail early with a clear message instead of an unbound-local error
        # the first time a key is built inside the loop.
        raise ValueError("side must be 'left' or 'right', got %r" % (side,))

    for w in words:
        if w == '':
            continue

        # Build each key from the fixed prefix. Accumulating into the key
        # itself would make it grow on every iteration and never repeat, so
        # nothing could be counted.
        word_key = word_prefix + str(w)
        cat_key = cat_prefix + sin_cat(w, text)

        # Real occurrence counts: 1 on first sight, +1 on each repeat.
        tmp_bow[word_key] = tmp_bow.get(word_key, 0) + 1
        tmp_bow[cat_key] = tmp_bow.get(cat_key, 0) + 1

    return tmp_bow
def test_sin_cat_root(self):
    # "was" in the sample sentence is expected to be categorised as 'VBD'.
    sentence = (
        "I think you have bought the cover story this is propaganda , "
        "absolutely opposite the truth , the man was trying to rebuild "
        "the western alliance in the threat of rising german aggression ! ."
    )
    entity_labels = ['/PERSON', '/LOCATION', '/ORGANIZATION']

    self.assertEqual('VBD', sin_cat("was", sentence, entity_labels))