def word_mapping(sentences, lower=True):
    """
    Build the word dictionary and the word <-> id lookups, including context words.

    Args:
        sentences: iterable of sentences; each sentence is a sequence of token
            rows. Field 0 of a row is the word, and every row must contain the
            marker '###'. Fields 7 up to the marker are collected as "before"
            context, fields after the marker up to (but excluding) the last
            field as "after" context.
        lower: when true, the word in field 0 is lower-cased. Context fields
            are always lower-cased, regardless of this flag.

    Returns:
        (dico, word_to_id, id_to_word) where dico comes from create_dico and
        the two mappings from create_mapping.

    Raises:
        ValueError: if a token row has no '###' field.
    """
    if lower:
        words = [[x[0].lower() for x in s] for s in sentences]
    else:
        words = [[x[0] for x in s] for s in sentences]

    context_bef = []
    context_aft = []
    for s in sentences:
        before = []
        after = []
        for x in s:
            marker = x.index('###')
            before.extend(tok.lower() for tok in x[7:marker])
            after.extend(tok.lower() for tok in x[marker + 1:len(x) - 1])
        # One context list per sentence, before and after the marker.
        context_bef.append(before)
        context_aft.append(after)

    words += context_bef
    words += context_aft

    dico = create_dico(words)
    # Presumably a very large count so '<UNK>' is ranked first by
    # create_mapping -- TODO confirm against create_mapping.
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico)

    unique_count = len(dico)
    total_count = sum(len(x) for x in words)
    print("Found %i unique words (%i in total)" % (unique_count, total_count))
    return dico, word_to_id, id_to_word
def morpho_tag_mapping(sentences, morpho_tag_type='wo_root', morpho_tag_column_index=1, joint_learning=False):
    """
    Create a dictionary and a mapping of morphological tags.

    Args:
        sentences: iterable of sentences, each a sequence of token rows.
        morpho_tag_type: 'char' builds the vocabulary from raw strings (the
            per-sentence concatenation of the morpho tag column plus every
            field in ``w[2:-1]`` of every token); any other value delegates to
            extract_morpho_tags_ordered.
        morpho_tag_column_index: column holding the morpho tag of a token.
        joint_learning: forwarded unchanged to extract_morpho_tags_ordered.

    Returns:
        (dico, morpho_tag_to_id, id_to_morpho_tag) where dico comes from
        create_dico and the two mappings from create_mapping.
    """
    if morpho_tag_type == 'char':
        morpho_tags = [
            "".join([w[morpho_tag_column_index] for w in s])
            for s in sentences
        ]
        # The for-clauses of a comprehension nest left to right, so the
        # sentence loop must come first; the previous order referenced `w`
        # and `s` before they were bound.
        morpho_tags += [ww for s in sentences for w in s for ww in w[2:-1]]
    else:
        morpho_tags = extract_morpho_tags_ordered(
            morpho_tag_type,
            sentences,
            morpho_tag_column_index,
            joint_learning=joint_learning)

    # Reserve an entry for tags that were never seen.
    morpho_tags.append(["*UNKNOWN*"])
    dico = create_dico(morpho_tags)
    morpho_tag_to_id, id_to_morpho_tag = create_mapping(dico)
    print(morpho_tag_to_id)
    print("Found %i unique morpho tags" % len(dico))
    return dico, morpho_tag_to_id, id_to_morpho_tag
def feats_mapping(sentences, feat_column):
    """
    Create one dictionary and one pair of mappings per feature column.

    (The original docstring carries the attribution "Boliang".)

    The feature columns are ``feat_column`` up to, but excluding, the last
    column of a token row; the row length is taken from the first token of
    the first sentence.

    Args:
        sentences: non-empty list of sentences, each a sequence of token rows
            that must all have the same length.
        feat_column: index of the first feature column.

    Returns:
        (dico_list, feat_to_id_list, id_to_feat_list): three parallel lists
        with one entry per feature column.

    Raises:
        AssertionError: if token rows do not all have the same length.
        IndexError: if ``sentences`` or its first sentence is empty.
    """
    row_len = len(sentences[0][0])
    # Flatten to one boolean per token. The previous form passed a list of
    # lists to all(), so any non-empty sentence counted as true, and it
    # compared len(word) with the first field rather than with its length.
    assert all(len(word) == row_len for s in sentences for word in s), \
        'features length are not consistent for all instances.'

    dico_list = []
    feat_to_id_list = []
    id_to_feat_list = []

    feature_len = row_len - feat_column - 1
    for i in range(feature_len):
        feats = [[word[i + feat_column] for word in s] for s in sentences]
        dico = create_dico(feats)
        # Presumably a very large count so '<UNK>' is ranked first by
        # create_mapping -- TODO confirm against create_mapping.
        dico['<UNK>'] = 10000000
        feat_to_id, id_to_feat = create_mapping(dico)

        dico_list.append(dico)
        feat_to_id_list.append(feat_to_id)
        id_to_feat_list.append(id_to_feat)

    return dico_list, feat_to_id_list, id_to_feat_list
def tag_mapping(tags):
    """
    Build the tag dictionary and the tag <-> id lookups from ready-made tags.

    Args:
        tags: tag sequences, passed to create_dico unchanged.

    Returns:
        (dico, tag_to_id, id_to_tag) where dico comes from create_dico and
        the two mappings from create_mapping.
    """
    tag_counts = create_dico(tags)
    mappings = create_mapping(tag_counts)
    tag_to_id, id_to_tag = mappings
    n_tags = len(tag_counts)
    print("Found %i unique named entity tags" % (n_tags))
    return tag_counts, tag_to_id, id_to_tag
def pt_mapping(sentences):
    """
    Create a dictionary and a mapping of the second-to-last token column.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            ``word[-2]`` is read from every row.

    Returns:
        (dico, pt_to_id, id_to_pt) where dico comes from create_dico and the
        two mappings from create_mapping.
    """
    pts = [[word[-2] for word in s] for s in sentences]
    dico = create_dico(pts)
    # A single-space key with a very large count; presumably a padding or
    # blank entry meant to rank first -- TODO confirm with callers.
    dico[' '] = 100000000
    pt_to_id, id_to_pt = create_mapping(dico)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print("Found %i unique pos tags" % len(dico))
    return dico, pt_to_id, id_to_pt
def char_mapping(sentences):
    """
    Build the character dictionary and the character <-> id lookups.

    Field 0 of every token in a sentence is concatenated into one string per
    sentence, and those strings are handed to create_dico.

    Args:
        sentences: iterable of sentences, each a sequence of token rows whose
            field 0 is a string.

    Returns:
        (dico, char_to_id, id_to_char) where dico comes from create_dico and
        the two mappings from create_mapping.
    """
    chars = []
    for s in sentences:
        chars.append("".join(w[0] for w in s))
    char_counts = create_dico(chars)
    char_to_id, id_to_char = create_mapping(char_counts)
    n_chars = len(char_counts)
    print("Found %i unique characters" % n_chars)
    return char_counts, char_to_id, id_to_char
def cluster_mapping(sentences):
    """
    Create a dictionary and a mapping of clusters.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the cluster is read from column 4 of every row.

    Returns:
        (dico, cluster_to_id, id_to_cluster) where dico comes from
        create_dico and the two mappings from create_mapping.
    """
    clusters = [[word[4] for word in s] for s in sentences]
    dico = create_dico(clusters)
    cluster_to_id, id_to_cluster = create_mapping(dico)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print("Found %i clusters" % len(dico))
    return dico, cluster_to_id, id_to_cluster
def POStag_mapping(sentences):
    """
    Create a dictionary and a mapping of POS tags.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the POS tag is read from column 1 of every row.

    Returns:
        (dico, POStag_to_id, id_to_POStag) where dico comes from create_dico
        and the two mappings from create_mapping.
    """
    POStags = [[word[1] for word in s] for s in sentences]
    dico = create_dico(POStags)
    POStag_to_id, id_to_POStag = create_mapping(dico)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print("Found %i POS tags" % len(dico))
    return dico, POStag_to_id, id_to_POStag
def tag_mapping(sentences):
    """
    Build the tag dictionary and the tag <-> id lookups.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the tag is the last field of every row.

    Returns:
        (dico, tag_to_id, id_to_tag) where dico comes from create_dico and
        the two mappings from create_mapping.
    """
    tags = []
    for s in sentences:
        tags.append([word[-1] for word in s])
    tag_counts = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(tag_counts)
    n_tags = len(tag_counts)
    print("Found %i unique named entity tags" % n_tags)
    return tag_counts, tag_to_id, id_to_tag
def pos_mapping(sentences):
    """
    Create a dictionary and a mapping of pos values.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the value is read from column 1 of every row.

    Returns:
        (dico, pos_to_id, id_to_pos) where dico comes from create_dico and
        the two mappings from create_mapping.
    """
    pos = [[word[1] for word in s] for s in sentences]
    dico = create_dico(pos)
    pos_to_id, id_to_pos = create_mapping(dico)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print("Found %i unique pos" % len(dico))
    return dico, pos_to_id, id_to_pos
def tag_mapping(sentences):
    """
    Build the tag dictionary and the tag <-> id lookups.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the tag is the last field of every row.

    Returns:
        (dico, tag_to_id, id_to_tag) where dico comes from create_dico and
        the two mappings from create_mapping.
    """
    last_fields = [[row[-1] for row in sent] for sent in sentences]
    tag_counts = create_dico(last_fields)
    lookups = create_mapping(tag_counts)
    tag_to_id, id_to_tag = lookups
    print("Found %i unique named entity tags" % (len(tag_counts)))
    return tag_counts, tag_to_id, id_to_tag
def pos_mapping(sentences):
    """
    Create a dictionary and a mapping of POS tags.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the POS tag is read from column 2 of every row.

    Returns:
        (dico, tag_to_id, id_to_tag) where dico comes from create_dico and
        the two mappings from create_mapping.
    """
    tags = [[word[2] for word in s] for s in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print("Found %i unique POS tags" % len(dico))
    return dico, tag_to_id, id_to_tag
def chunk_mapping(sentences, col=2):
    """
    Create a dictionary and a mapping of chunk tags.

    Args:
        sentences: iterable of sentences, each a sequence of token rows.
        col: column of a token row that holds the chunk tag (default 2).

    Returns:
        (dico, chunk_to_id, id_to_chunk) where dico comes from create_dico
        and the two mappings from create_mapping.
    """
    tags = [[word[col] for word in s] for s in sentences]
    dico = create_dico(tags)
    chunk_to_id, id_to_chunk = create_mapping(dico)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print("Found %i unique Chunk tags" % len(dico))
    return dico, chunk_to_id, id_to_chunk
def mor_mapping(sentences):
    """
    Build the dictionary and <-> id lookups for column 2 of every token.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the value is read from column 2 of every row.

    Returns:
        (dico, mor_to_id, id_to_mor) where dico comes from create_dico (plus
        an added '<UNK>' entry) and the two mappings from create_mapping.
    """
    column_values = []
    for s in sentences:
        column_values.append([word[2] for word in s])
    counts = create_dico(column_values)
    # Presumably a very large count so '<UNK>' is ranked first by
    # create_mapping -- TODO confirm against create_mapping.
    counts['<UNK>'] = 10000000
    mor_to_id, id_to_mor = create_mapping(counts)
    n_unique = len(counts)
    print("Found %i unique causality pos" % n_unique)
    return counts, mor_to_id, id_to_mor
def dep_verb_mapping(sentences):
    """
    Create a dictionary and a mapping of verb dependency words.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            column 4 is a '|'-separated string of which only the first
            part is used.

    Returns:
        (dico, pos_to_id, id_to_pos) where dico comes from create_dico (plus
        an added '<UNK>' entry) and the two mappings from create_mapping.
    """
    tags = [[word[4].split('|')[0] for word in s] for s in sentences]
    dico = create_dico(tags)
    # Presumably a very large count so '<UNK>' is ranked first by
    # create_mapping -- TODO confirm against create_mapping.
    dico['<UNK>'] = 10000000
    pos_to_id, id_to_pos = create_mapping(dico)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print("Found %i unique verb dep words" % len(dico))
    return dico, pos_to_id, id_to_pos
def pos_mapping(sentences, position=1):
    """
    Create a dictionary and a mapping of the values in one token column.

    Args:
        sentences: iterable of sentences, each a sequence of token rows.
        position: column of a token row to read (default 1).

    Returns:
        (dico, tag_to_id, id_to_tag) where dico comes from create_dico and
        the two mappings from create_mapping.
    """
    tags = [[word[position] for word in s] for s in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    # print() with one argument behaves the same on Python 2 and Python 3.
    # The message text is kept exactly as it was.
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
def dep_mapping(sentences):
    """
    Create a dictionary and a mapping of dependency tags.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the dependency tag is read from column 4 of every row.

    Returns:
        (dico, tag_to_id, id_to_tag) where dico comes from create_dico and
        the two mappings from create_mapping.
    """
    tags = [[word[4] for word in s] for s in sentences]
    dico = create_dico(tags)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print(dico)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique Dependency Role tags" % len(dico))
    return dico, tag_to_id, id_to_tag
def char_mapping(sentences):
    """
    Build the character dictionary (with a '<PAD>' entry) and the lookups.

    Field 0 of every token in a sentence is concatenated into one string per
    sentence, and those strings are handed to create_dico.

    Args:
        sentences: iterable of sentences, each a sequence of token rows whose
            field 0 is a string.

    Returns:
        (dico, char_to_id, id_to_char); the last two are the second and the
        first value returned by create_mapping, respectively.
    """
    chars = []
    for sentence in sentences:
        chars.append(''.join(w[0] for w in sentence))
    char_counts = create_dico(chars)
    # Presumably a very large count so '<PAD>' is ranked first by
    # create_mapping -- TODO confirm against create_mapping.
    char_counts['<PAD>'] = 1000000
    # NOTE(review): create_mapping's first result is bound to id_to_char
    # here, which is the reverse of the other *_mapping functions -- confirm
    # against create_mapping's return order.
    lookups = create_mapping(char_counts)
    id_to_char = lookups[0]
    char_to_id = lookups[1]
    return char_counts, char_to_id, id_to_char
def semroles_mapping(sentences):
    """
    Build the semantic-role dictionary and the role <-> id lookups.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the role label is read from column 4 of every row.

    Returns:
        (dico, semroles_to_id, id_to_semroles) where dico comes from
        create_dico (plus an added '<UNK>' entry) and the two mappings from
        create_mapping.
    """
    semroles = []
    for s in sentences:
        semroles.append([x[4] for x in s])
    role_counts = create_dico(semroles)
    # Presumably a very large count so '<UNK>' is ranked first by
    # create_mapping -- TODO confirm against create_mapping.
    role_counts['<UNK>'] = 10000000
    semroles_to_id, id_to_semroles = create_mapping(role_counts)
    n_unique = len(role_counts)
    n_total = sum(len(x) for x in semroles)
    print("Found %i unique semroles (%i in total)" % (n_unique, n_total))
    return role_counts, semroles_to_id, id_to_semroles
def char_mapping(sentences):
    """
    Create a dictionary and a mapping of chars.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            field 0 of every row is used as one item.

    Returns:
        (dico, char_to_id, id_to_char) where dico comes from create_dico
        (plus an added '<UNK>' entry) and the two mappings from
        create_mapping.
    """
    chars = [[x[0] for x in s] for s in sentences]
    dico = create_dico(chars)
    # Presumably a very large count so '<UNK>' is ranked first by
    # create_mapping -- TODO confirm against create_mapping.
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print("Found %i unique chars (%i in total)" % (
        len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
def char_mapping(sentences):
    """
    Create a dictionary and mapping of characters, defaulting unseen ones to 0.

    Field 0 of every token in a sentence is concatenated into one string per
    sentence, and those strings are handed to create_dico.

    Args:
        sentences: iterable of sentences, each a sequence of token rows whose
            field 0 is a string.

    Returns:
        (dico, char_to_id, id_to_char); char_to_id is a defaultdict that
        yields 0 for any missing key.
    """
    chars = ["".join([w[0] for w in s]) for s in sentences]
    dico = create_dico(chars)
    char_to_id, id_to_char = create_mapping(dico)
    # SWM: replace unseen characters with special symbol (hopefully seen in
    # training). Note that a lookup of a missing key also inserts it.
    char_to_id = defaultdict(lambda: 0, char_to_id)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print("Found %i unique characters" % len(dico))
    return dico, char_to_id, id_to_char
def conNode_mapping(sentences):
    """
    Build the conNode dictionary and the conNode <-> id lookups.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the label is read from column 2 of every row.

    Returns:
        (dico, conNode_to_id, id_to_conNode) where dico comes from
        create_dico (plus an added '<UNK>' entry) and the two mappings from
        create_mapping.
    """
    labels = [[row[2] for row in sent] for sent in sentences]
    label_counts = create_dico(labels)
    # Presumably a very large count so '<UNK>' is ranked first by
    # create_mapping -- TODO confirm against create_mapping.
    label_counts['<UNK>'] = 10000000
    conNode_to_id, id_to_conNode = create_mapping(label_counts)
    summary = (len(label_counts), sum(len(sent) for sent in labels))
    print("Found %i unique conNode (%i in total)" % summary)
    return label_counts, conNode_to_id, id_to_conNode
def pos_mapping(sentences):
    """
    Create a dictionary and a mapping of pos values.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the value is read from column 1 of every row.

    Returns:
        (dico, pos_to_id, id_to_pos) where dico comes from create_dico (plus
        an added '<UNK>' entry) and the two mappings from create_mapping.
    """
    pos = [[x[1] for x in s] for s in sentences]
    dico = create_dico(pos)
    # Presumably a very large count so '<UNK>' is ranked first by
    # create_mapping -- TODO confirm against create_mapping.
    dico['<UNK>'] = 10000000
    pos_to_id, id_to_pos = create_mapping(dico)
    # print() with one argument behaves the same on Python 2 and Python 3.
    # The message text is kept exactly as it was.
    print("Found %i unique words (%i in total)" % (
        len(dico), sum(len(x) for x in pos)))
    return dico, pos_to_id, id_to_pos
def loc_mapping(sentences):
    """
    Build the location-label dictionary and the label <-> id lookups.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the label is read from column 6 of every row.

    Returns:
        (dico, loc_to_id, id_to_loc) where dico comes from create_dico (plus
        an added '<UNK>' entry) and the two mappings from create_mapping.
    """
    loc = []
    for s in sentences:
        loc.append([x[6] for x in s])
    loc_counts = create_dico(loc)
    # Presumably a very large count so '<UNK>' is ranked first by
    # create_mapping -- TODO confirm against create_mapping.
    loc_counts['<UNK>'] = 10000000
    loc_to_id, id_to_loc = create_mapping(loc_counts)
    n_unique = len(loc_counts)
    n_total = sum(len(x) for x in loc)
    print("Found %i unique location (%i in total)" % (n_unique, n_total))
    return loc_counts, loc_to_id, id_to_loc
def head_mapping(sentences):
    """
    Create a dictionary and a mapping of head tags.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the head tag is read from column 3 of every row.

    Returns:
        (dico, tag_to_id, id_to_tag) where dico comes from create_dico (plus
        an added 'MAX' entry) and the two mappings from create_mapping.
    """
    tags = [[word[3] for word in s] for s in sentences]
    dico = create_dico(tags)
    # Presumably a very large count so 'MAX' is ranked first by
    # create_mapping -- TODO confirm against create_mapping.
    dico['MAX'] = 10000000
    # print() with one argument behaves the same on Python 2 and Python 3.
    print(dico)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique Head index tags" % len(dico))
    return dico, tag_to_id, id_to_tag
def pos_mapping(sentences):
    """
    Create a dictionary and a mapping of pos tags.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the pos tag is read from column 1 of every row.

    Returns:
        (dico, pos_tag_to_id, id_to_pos_tag) where dico comes from
        create_dico (plus an added '<UNKPOS>' entry) and the two mappings
        from create_mapping.
    """
    pos_tags = [[x[1] for x in s] for s in sentences]
    dico = create_dico(pos_tags)
    # sys.maxint only exists on Python 2; fall back to sys.maxsize where it
    # is missing so the sentinel stays "the largest ordinary integer".
    dico['<UNKPOS>'] = getattr(sys, 'maxint', sys.maxsize)
    pos_tag_to_id, id_to_pos_tag = create_mapping(dico)
    # print() with one argument behaves the same on Python 2 and Python 3.
    print("Found %i unique pos tags (%i in total)" % (
        len(dico), sum(len(x) for x in pos_tags)))
    return dico, pos_tag_to_id, id_to_pos_tag
def tag_mapping(sentences):
    """
    Build the tag dictionary (with start/stop tags) and the tag <-> id lookups.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the tag is the last field of every row.

    Returns:
        (dico, tag_to_id, id_to_tag) where dico comes from create_dico, plus
        entries for model.START_TAG (-1) and model.STOP_TAG (-2), and the two
        mappings come from create_mapping.
    """
    tags = []
    for s in sentences:
        tags.append([word[-1] for word in s])
    tag_counts = create_dico(tags)
    # Negative counts; presumably so the two markers rank after every real
    # tag in create_mapping -- TODO confirm against create_mapping.
    tag_counts[model.START_TAG] = -1
    tag_counts[model.STOP_TAG] = -2
    tag_to_id, id_to_tag = create_mapping(tag_counts)
    n_tags = len(tag_counts)
    print("Found %i unique causality tags" % n_tags)
    return tag_counts, tag_to_id, id_to_tag
def depNode_mapping(sentences):
    """
    Build the depNode dictionary and the depNode <-> id lookups.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the label is read from column 3 of every row.

    Returns:
        (dico, depNode_to_id, id_to_depNode) where dico comes from
        create_dico (plus an added '<UNK>' entry) and the two mappings from
        create_mapping.
    """
    labels = [[row[3] for row in sent] for sent in sentences]
    label_counts = create_dico(labels)
    # Presumably a very large count so '<UNK>' is ranked first by
    # create_mapping -- TODO confirm against create_mapping.
    label_counts['<UNK>'] = 10000000
    depNode_to_id, id_to_depNode = create_mapping(label_counts)
    summary = (len(label_counts), sum(len(sent) for sent in labels))
    print("Found %i unique depNode (%i in total)" % summary)
    return label_counts, depNode_to_id, id_to_depNode
def word_mapping(sentences, lower):
    """
    Build the word dictionary and the word <-> id lookups.

    Args:
        sentences: iterable of sentences, each a sequence of token rows whose
            field 0 is the word.
        lower: when true, every word is lower-cased before counting.

    Returns:
        (dico, word_to_id, id_to_word) where dico comes from create_dico
        (plus an added '<UNK>' entry) and the two mappings from
        create_mapping.
    """
    words = []
    for s in sentences:
        if lower:
            words.append([x[0].lower() for x in s])
        else:
            words.append([x[0] for x in s])
    word_counts = create_dico(words)
    # Presumably a very large count so '<UNK>' is ranked first by
    # create_mapping -- TODO confirm against create_mapping.
    word_counts['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(word_counts)
    n_unique = len(word_counts)
    n_total = sum(len(x) for x in words)
    print("Found %i unique words (%i in total)" % (n_unique, n_total))
    return word_counts, word_to_id, id_to_word
def tag_mapping(data_path, data_type):
    """
    Build the tag dictionary and lookups from a "<type>_labels.txt" file.

    The file at ``data_path + data_type + "_labels.txt"`` is read line by
    line; each line is split on single spaces and the last piece is dropped.

    Args:
        data_path: path prefix, concatenated as-is (no separator is added).
        data_type: name placed between the prefix and "_labels.txt".

    Returns:
        (dico, tag_to_id, id_to_tag) where dico comes from create_dico, plus
        entries for model.START_TAG (-1) and model.STOP_TAG (-2), and the two
        mappings come from create_mapping.
    """
    label_file = data_path + data_type + "_labels.txt"
    tags = []
    with open(label_file, "r") as handle:
        for line in handle:
            pieces = line.split(" ")
            # The final piece (whatever follows the last space, including
            # the line ending) is discarded.
            tags.append(pieces[:-1])
    tag_counts = create_dico(tags)
    # Negative counts; presumably so the two markers rank after every real
    # tag in create_mapping -- TODO confirm against create_mapping.
    tag_counts[model.START_TAG] = -1
    tag_counts[model.STOP_TAG] = -2
    tag_to_id, id_to_tag = create_mapping(tag_counts)
    n_tags = len(tag_counts)
    print("Found %i unique named entity tags" % n_tags)
    return tag_counts, tag_to_id, id_to_tag
def tag_mapping(sentences):
    """
    Build the tag dictionary (with start/stop tags) and the lookups.

    Args:
        sentences: iterable of sentences, each a sequence of token rows;
            the tag is the last field of every row.

    Returns:
        (dico, tag_to_id, id_to_tag); the last two are the second and the
        first value returned by create_mapping, respectively.
    """
    tags = []
    for s in sentences:
        tags.append([word[-1] for word in s])
    tag_counts = create_dico(tags)
    # Negative counts; presumably so the two markers rank after every real
    # tag in create_mapping -- TODO confirm against create_mapping.
    tag_counts[model.START_TAG] = -1
    tag_counts[model.STOP_TAG] = -2
    # NOTE(review): create_mapping's first result is bound to id_to_tag
    # here, which is the reverse of the other *_mapping functions -- confirm
    # against create_mapping's return order.
    lookups = create_mapping(tag_counts)
    id_to_tag = lookups[0]
    tag_to_id = lookups[1]
    return tag_counts, tag_to_id, id_to_tag
def word_mapping(sentences, lower):
    """
    Build the word dictionary and the word <-> id lookups.

    Args:
        sentences: iterable of sentences, each a sequence of token rows whose
            field 0 is the word.
        lower: when true, every word is lower-cased before counting.

    Returns:
        (dico, word_to_id, id_to_word) where dico comes from create_dico
        (plus an added '<UNK>' entry) and the two mappings from
        create_mapping.
    """
    if lower:
        words = [[row[0].lower() for row in sent] for sent in sentences]
    else:
        words = [[row[0] for row in sent] for sent in sentences]
    word_counts = create_dico(words)
    # Presumably a very large count so '<UNK>' is ranked first by
    # create_mapping -- TODO confirm against create_mapping.
    word_counts['<UNK>'] = 10000000
    lookups = create_mapping(word_counts)
    word_to_id, id_to_word = lookups
    summary = (len(word_counts), sum(len(sent) for sent in words))
    print("Found %i unique words (%i in total)" % summary)
    return word_counts, word_to_id, id_to_word
def word_mapping(sentences, vocabulary_size, pre_train=None):
    """
    Create a dictionary and a mapping of words, limited by vocabulary_size.

    Args:
        sentences: iterable of sentences, each a sequence of token rows whose
            field 0 is the word (case is left untouched).
        vocabulary_size: forwarded as the second argument of create_mapping.
        pre_train: optional value passed to read_pre_training; when truthy,
            every word missing from the loaded embeddings gets its count
            reset to 0.

    Returns:
        (dico, word_to_id, id_to_word) where dico comes from create_dico
        (plus an added '<UNK>' entry) and the two mappings from
        create_mapping.
    """
    words = [[x[0] for x in s] for s in sentences]
    dico = create_dico(words)
    # NOTE(review): this first mapping is overwritten before the return;
    # it is kept in case create_mapping has side effects -- confirm and
    # drop if not.
    word_to_id, id_to_word = create_mapping(dico, vocabulary_size)
    print("Found %i unique words (%i in total)" % (
        len(dico), sum(len(x) for x in words)))

    if pre_train:
        emb_dictionary = read_pre_training(pre_train)
        # Iterate the dict directly: dict.iterkeys() is Python 2 only.
        # Only existing keys are reassigned, so the dict size never
        # changes during iteration.
        for word in dico:
            if word not in emb_dictionary:
                dico[word] = 0

    # NOTE(review): assumed to run regardless of pre_train (the original
    # nesting is not recoverable from the collapsed source) -- confirm.
    # '<UNK>' is added after the loop above so its count is not zeroed.
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico, vocabulary_size)
    return dico, word_to_id, id_to_word