Example #1
0
def word_mapping(sentences, lower=True):
    """
    Build the word vocabulary from token surface forms and their contexts.

    The first field of every token is collected (lower-cased when `lower`
    is true).  For every token, the fields from index 7 up to the '###'
    marker, and the fields after the marker except the last one, are also
    collected; these are always lower-cased, whatever `lower` is.

    Returns the tuple (dico, word_to_id, id_to_word): the create_dico
    result with an extra '<UNK>' entry, followed by the two values
    returned by create_mapping(dico).
    """
    if lower:
        words = [[token[0].lower() for token in sent] for sent in sentences]
    else:
        words = [[token[0] for token in sent] for sent in sentences]

    context_bef = []
    context_aft = []
    for sent in sentences:
        # One "before" and one "after" bucket per sentence.
        before = []
        after = []
        for token in sent:
            marker = token.index('###')
            before.extend(field.lower() for field in token[7:marker])
            after.extend(
                field.lower() for field in token[marker + 1:len(token) - 1])
        context_bef.append(before)
        context_aft.append(after)
    words += context_bef
    words += context_aft

    dico = create_dico(words)
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico)

    total = sum(len(group) for group in words)
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, word_to_id, id_to_word
Example #2
0
def morpho_tag_mapping(sentences,
                       morpho_tag_type='wo_root',
                       morpho_tag_column_index=1,
                       joint_learning=False):
    """
    Create a dictionary and a mapping of morphological tags.

    Args:
        sentences: list of sentences, each a list of tokens (indexable
            sequences of fields).
        morpho_tag_type: 'char' builds the tag inventory inline from the
            token fields; any other value is forwarded to
            extract_morpho_tags_ordered.
        morpho_tag_column_index: field index read in 'char' mode; also
            forwarded to extract_morpho_tags_ordered otherwise.
        joint_learning: forwarded to extract_morpho_tags_ordered.

    Returns:
        (dico, morpho_tag_to_id, id_to_morpho_tag): the create_dico result
        and the two values returned by create_mapping(dico).
    """
    if morpho_tag_type == 'char':
        morpho_tags = [
            "".join([w[morpho_tag_column_index] for w in s]) for s in sentences
        ]
        # Comprehension clauses run outermost-first (sentence -> token ->
        # field).  The previous reversed order read `w` and `s` before they
        # were bound.
        morpho_tags += [ww for s in sentences for w in s for ww in w[2:-1]]
    else:
        morpho_tags = extract_morpho_tags_ordered(
            morpho_tag_type,
            sentences,
            morpho_tag_column_index,
            joint_learning=joint_learning)

    # Extra entry so the inventory always contains an "unknown" tag.
    morpho_tags.append(["*UNKNOWN*"])
    dico = create_dico(morpho_tags)
    morpho_tag_to_id, id_to_morpho_tag = create_mapping(dico)
    # Parenthesised prints parse on Python 3 and print the same text on
    # Python 2.
    print(morpho_tag_to_id)
    print("Found %i unique morpho tags" % len(dico))
    return dico, morpho_tag_to_id, id_to_morpho_tag
Example #3
0
def feats_mapping(sentences, feat_column):
    """
    Create a list of dictionaries and a list of mappings of features.

    One dictionary / mapping pair is built per feature column, for the
    columns feat_column .. len(token) - 2 of every token.

    Args:
        sentences: non-empty list of sentences, each a list of tokens
            (indexable sequences of fields); the first sentence must hold
            at least one token.
        feat_column: index of the first feature column.

    Returns:
        (dico_list, feat_to_id_list, id_to_feat_list), one entry per
        feature column.  Every dictionary gets an extra '<UNK>' entry.

    Raises:
        AssertionError: if the tokens do not all have the same number of
            fields as the first token.
        IndexError: if `sentences` or its first sentence is empty.
    """
    expected_len = len(sentences[0][0])
    # Flatten the check.  The previous form called all() on a list of
    # non-empty lists (always truthy) and compared a length with a field
    # value, so inconsistent rows were never detected.
    assert all(len(word) == expected_len for s in sentences for word in s), \
        'features length are not consistent for all instances.'

    dico_list = []
    feat_to_id_list = []
    id_to_feat_list = []

    feature_len = expected_len - feat_column - 1

    for i in range(feature_len):
        feats = [[word[i + feat_column] for word in s] for s in sentences]
        dico = create_dico(feats)
        dico['<UNK>'] = 10000000
        feat_to_id, id_to_feat = create_mapping(dico)

        dico_list.append(dico)
        feat_to_id_list.append(feat_to_id)
        id_to_feat_list.append(id_to_feat)

    return dico_list, feat_to_id_list, id_to_feat_list
Example #4
0
def tag_mapping(tags):
    """
    Build the tag dictionary and both lookup tables from `tags`.

    `tags` is handed to create_dico unchanged; the result is returned
    together with the two values produced by create_mapping.
    """
    dico = create_dico(tags)
    mappings = create_mapping(dico)
    tag_to_id, id_to_tag = mappings
    n_tags = len(dico)
    print("Found %i unique named entity tags" % n_tags)
    return dico, tag_to_id, id_to_tag
Example #5
0
def pt_mapping(sentences):
    """
    Create a dictionary and a mapping from the second-to-last token field.

    Args:
        sentences: iterable of sentences; each sentence is an iterable of
            tokens, and index -2 of every token is collected.

    Returns:
        (dico, pt_to_id, id_to_pt): the create_dico result (with an extra
        ' ' entry set to 100000000) and the two values returned by
        create_mapping(dico).
    """
    pts = [[word[-2] for word in s] for s in sentences]
    dico = create_dico(pts)
    dico[' '] = 100000000
    pt_to_id, id_to_pt = create_mapping(dico)
    # Parenthesised so the line parses on Python 3 and prints the same text
    # on Python 2.
    print("Found %i unique pos tags" % len(dico))
    return dico, pt_to_id, id_to_pt
Example #6
0
def char_mapping(sentences):
    """
    Build the character dictionary and both lookup tables.

    The first field of every token is concatenated into one string per
    sentence, and the list of those strings is passed to create_dico.
    """
    chars = []
    for sentence in sentences:
        first_fields = [token[0] for token in sentence]
        chars.append("".join(first_fields))
    dico = create_dico(chars)
    char_to_id, id_to_char = create_mapping(dico)
    n_chars = len(dico)
    print("Found %i unique characters" % n_chars)
    return dico, char_to_id, id_to_char
Example #7
0
def cluster_mapping(sentences):
    """
    Create a dictionary and mapping of clusters.

    Args:
        sentences: iterable of sentences; each sentence is an iterable of
            tokens, and index 4 of every token is collected.

    Returns:
        (dico, cluster_to_id, id_to_cluster): the create_dico result and
        the two values returned by create_mapping(dico).
    """
    clusters = [[word[4] for word in s] for s in sentences]
    dico = create_dico(clusters)
    cluster_to_id, id_to_cluster = create_mapping(dico)
    # Parenthesised so the line parses on Python 3 and prints the same text
    # on Python 2.
    print("Found %i clusters" % len(dico))
    return dico, cluster_to_id, id_to_cluster
Example #8
0
def POStag_mapping(sentences):
    """
    Create a dictionary and mapping of POS tags.

    Args:
        sentences: iterable of sentences; each sentence is an iterable of
            tokens, and index 1 of every token is collected.

    Returns:
        (dico, POStag_to_id, id_to_POStag): the create_dico result and the
        two values returned by create_mapping(dico).
    """
    POStags = [[word[1] for word in s] for s in sentences]
    dico = create_dico(POStags)
    POStag_to_id, id_to_POStag = create_mapping(dico)
    # Parenthesised so the line parses on Python 3 and prints the same text
    # on Python 2.
    print("Found %i POS tags" % len(dico))
    return dico, POStag_to_id, id_to_POStag
Example #9
0
def tag_mapping(sentences):
    """
    Build the tag dictionary and both lookup tables.

    The tag of a token is its last field; tags are grouped per sentence
    before being passed to create_dico.
    """
    tags = []
    for sentence in sentences:
        tags.append([token[-1] for token in sentence])
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    n_tags = len(dico)
    print("Found %i unique named entity tags" % n_tags)
    return dico, tag_to_id, id_to_tag
Example #10
0
def pos_mapping(sentences):
    """
    Create a dictionary and a mapping of pos.

    Args:
        sentences: iterable of sentences; each sentence is an iterable of
            tokens, and index 1 of every token is collected.

    Returns:
        (dico, pos_to_id, id_to_pos): the create_dico result and the two
        values returned by create_mapping(dico).
    """
    pos = [[word[1] for word in s] for s in sentences]
    dico = create_dico(pos)
    pos_to_id, id_to_pos = create_mapping(dico)
    # Parenthesised so the line parses on Python 3 and prints the same text
    # on Python 2.
    print("Found %i unique pos" % len(dico))
    return dico, pos_to_id, id_to_pos
def tag_mapping(sentences):
    """
    Collect the last field of every token and build the tag vocabulary.

    Returns (dico, tag_to_id, id_to_tag): the create_dico result followed
    by the two values returned by create_mapping(dico).
    """
    per_sentence_tags = [[token[-1] for token in sent] for sent in sentences]
    dico = create_dico(per_sentence_tags)
    mappings = create_mapping(dico)
    tag_to_id, id_to_tag = mappings
    print("Found %i unique named entity tags" % (len(dico)))
    return dico, tag_to_id, id_to_tag
Example #12
0
def pos_mapping(sentences):
    """
    Create a dictionary and a mapping of pos tags.

    Args:
        sentences: iterable of sentences; each sentence is an iterable of
            tokens, and index 2 of every token is collected.

    Returns:
        (dico, tag_to_id, id_to_tag): the create_dico result and the two
        values returned by create_mapping(dico).
    """
    tags = [[word[2] for word in s] for s in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    # Parenthesised so the line parses on Python 3 and prints the same text
    # on Python 2.
    print("Found %i unique POS tags" % len(dico))
    return dico, tag_to_id, id_to_tag
Example #13
0
def char_mapping(sentences):
    """
    Build the character vocabulary from the first field of every token.

    Each sentence contributes one string (its tokens' first fields joined
    together); the strings are passed to create_dico as a list.
    """
    joined = []
    for sent in sentences:
        joined.append("".join(token[0] for token in sent))
    dico = create_dico(joined)
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique characters" % len(dico))
    return dico, char_to_id, id_to_char
Example #14
0
def chunk_mapping(sentences, col=2):
    """
    Create a dictionary and a mapping of chunk tags.

    Args:
        sentences: iterable of sentences; each sentence is an iterable of
            tokens.
        col: index of the token field to collect (default 2).

    Returns:
        (dico, chunk_to_id, id_to_chunk): the create_dico result and the
        two values returned by create_mapping(dico).
    """
    tags = [[word[col] for word in s] for s in sentences]
    dico = create_dico(tags)
    chunk_to_id, id_to_chunk = create_mapping(dico)
    # Parenthesised so the line parses on Python 3 and prints the same text
    # on Python 2.
    print("Found %i unique Chunk tags" % len(dico))
    return dico, chunk_to_id, id_to_chunk
Example #15
0
def mor_mapping(sentences):
    """
    Build a dictionary and both lookup tables from field 2 of every token.

    An extra '<UNK>' entry is added to the dictionary before the mappings
    are created.
    """
    collected = []
    for sentence in sentences:
        collected.append([token[2] for token in sentence])
    dico = create_dico(collected)
    dico['<UNK>'] = 10000000
    mor_to_id, id_to_mor = create_mapping(dico)
    n_entries = len(dico)
    print("Found %i unique causality pos" % n_entries)
    return dico, mor_to_id, id_to_mor
Example #16
0
def dep_verb_mapping(sentences):
    """
    Create a dictionary and a mapping from the head of token field 4.

    Args:
        sentences: iterable of sentences; each sentence is an iterable of
            tokens.  Field 4 of every token is split on '|' and only the
            first piece is collected.

    Returns:
        (dico, pos_to_id, id_to_pos): the create_dico result (with an
        extra '<UNK>' entry) and the two values returned by
        create_mapping(dico).
    """
    tags = [[word[4].split('|')[0] for word in s] for s in sentences]
    dico = create_dico(tags)
    dico['<UNK>'] = 10000000
    pos_to_id, id_to_pos = create_mapping(dico)
    # Parenthesised so the line parses on Python 3 and prints the same text
    # on Python 2.
    print("Found %i unique verb dep words" % len(dico))
    return dico, pos_to_id, id_to_pos
Example #17
0
def pos_mapping(sentences, position=1):
    """
    Create a dictionary and a mapping from one field of every token.

    Args:
        sentences: iterable of sentences; each sentence is an iterable of
            tokens.
        position: index of the token field to collect (default 1).

    Returns:
        (dico, tag_to_id, id_to_tag): the create_dico result and the two
        values returned by create_mapping(dico).
    """
    tags = [[word[position] for word in s] for s in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    # Parenthesised so the line parses on Python 3 and prints the same text
    # on Python 2.
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
Example #18
0
def dep_mapping(sentences):
    """
    Create a dictionary and a mapping of dep tags.

    Args:
        sentences: iterable of sentences; each sentence is an iterable of
            tokens, and index 4 of every token is collected.

    Returns:
        (dico, tag_to_id, id_to_tag): the create_dico result and the two
        values returned by create_mapping(dico).
    """
    tags = [[word[4] for word in s] for s in sentences]
    dico = create_dico(tags)
    # Parenthesised prints parse on Python 3 and print the same text on
    # Python 2.
    print(dico)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique Dependency Role tags" % len(dico))
    return dico, tag_to_id, id_to_tag
Example #19
0
def char_mapping(sentences):
    """
    Build the character vocabulary from the first field of every token.

    The first fields of each sentence are joined into one string per
    sentence before being passed to create_dico, and a '<PAD>' entry is
    added to the resulting dictionary.
    """
    chars = []
    for sentence in sentences:
        chars.append(''.join(token[0] for token in sentence))

    dico = create_dico(chars)
    dico['<PAD>'] = 1000000

    # NOTE(review): the result is unpacked id-first here -- confirm this
    # matches the return order of create_mapping in this project.
    id_to_char, char_to_id = create_mapping(dico)
    return dico, char_to_id, id_to_char
Example #20
0
def semroles_mapping(sentences):
    """
    Build the semantic-role dictionary and both lookup tables.

    Field 4 of every token is collected per sentence; an '<UNK>' entry is
    added to the dictionary before the mappings are created.
    """
    semroles = []
    for sentence in sentences:
        semroles.append([token[4] for token in sentence])
    dico = create_dico(semroles)
    dico['<UNK>'] = 10000000
    semroles_to_id, id_to_semroles = create_mapping(dico)
    total = sum(len(group) for group in semroles)
    print("Found %i unique semroles (%i in total)" % (len(dico), total))
    return dico, semroles_to_id, id_to_semroles
Example #21
0
def char_mapping(sentences):
    """
    Create a dictionary and a mapping of chars.

    Args:
        sentences: iterable of sentences; each sentence is a sized
            iterable of tokens, and index 0 of every token is collected.

    Returns:
        (dico, char_to_id, id_to_char): the create_dico result (with an
        extra '<UNK>' entry) and the two values returned by
        create_mapping(dico).
    """
    chars = [[x[0] for x in s] for s in sentences]
    dico = create_dico(chars)
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    # Parenthesised so the line parses on Python 3 and prints the same text
    # on Python 2.
    print("Found %i unique chars (%i in total)" %
          (len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
Example #22
0
def char_mapping(sentences):
    """
    Create a dictionary and mapping of characters.

    Args:
        sentences: iterable of sentences; each sentence is an iterable of
            tokens whose first field is a string.  The first fields of a
            sentence are joined into one string.

    Returns:
        (dico, char_to_id, id_to_char): the create_dico result, a
        defaultdict built from the first value returned by create_mapping
        (missing keys resolve to 0), and the second value returned by
        create_mapping.
    """
    chars = ["".join([w[0] for w in s]) for s in sentences]
    dico = create_dico(chars)
    char_to_id, id_to_char = create_mapping(dico)
    # Characters that are not in the mapping fall back to id 0 instead of
    # raising KeyError.
    char_to_id = defaultdict(lambda: 0, char_to_id)
    # Parenthesised so the line parses on Python 3 and prints the same text
    # on Python 2.
    print("Found %i unique characters" % len(dico))
    return dico, char_to_id, id_to_char
Example #23
0
def conNode_mapping(sentences):
    """
    Build the conNode dictionary and both lookup tables.

    Field 2 of every token is collected per sentence; an '<UNK>' entry is
    added to the dictionary before the mappings are created.
    """
    conNode = []
    for sentence in sentences:
        conNode.append([token[2] for token in sentence])
    dico = create_dico(conNode)
    dico['<UNK>'] = 10000000
    conNode_to_id, id_to_conNode = create_mapping(dico)
    total = sum(len(group) for group in conNode)
    print("Found %i unique conNode (%i in total)" % (len(dico), total))
    return dico, conNode_to_id, id_to_conNode
Example #24
0
def pos_mapping(sentences):
    """
    Create a dictionary and a mapping from field 1 of every token.

    Args:
        sentences: iterable of sentences; each sentence is a sized
            iterable of tokens, and index 1 of every token is collected.

    Returns:
        (dico, pos_to_id, id_to_pos): the create_dico result (with an
        extra '<UNK>' entry) and the two values returned by
        create_mapping(dico).
    """
    pos = [[x[1] for x in s] for s in sentences]
    dico = create_dico(pos)
    dico['<UNK>'] = 10000000
    pos_to_id, id_to_pos = create_mapping(dico)
    # Parenthesised so the line parses on Python 3 and prints the same text
    # on Python 2.
    print("Found %i unique words (%i in total)" %
          (len(dico), sum(len(x) for x in pos)))
    return dico, pos_to_id, id_to_pos
Example #25
0
def loc_mapping(sentences):
    """
    Build the location-label dictionary and both lookup tables.

    Field 6 of every token is collected per sentence; an '<UNK>' entry is
    added to the dictionary before the mappings are created.
    """
    loc = []
    for sentence in sentences:
        loc.append([token[6] for token in sentence])
    dico = create_dico(loc)
    dico['<UNK>'] = 10000000
    loc_to_id, id_to_loc = create_mapping(dico)
    total = sum(len(group) for group in loc)
    print("Found %i unique location (%i in total)" % (len(dico), total))
    return dico, loc_to_id, id_to_loc
Example #26
0
def head_mapping(sentences):
    """
    Create a dictionary and a mapping of head tags.

    Args:
        sentences: iterable of sentences; each sentence is an iterable of
            tokens, and index 3 of every token is collected.

    Returns:
        (dico, tag_to_id, id_to_tag): the create_dico result (with an
        extra 'MAX' entry) and the two values returned by
        create_mapping(dico).
    """
    tags = [[word[3] for word in s] for s in sentences]
    dico = create_dico(tags)
    dico['MAX'] = 10000000
    # Parenthesised prints parse on Python 3 and print the same text on
    # Python 2.
    print(dico)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique Head index tags" % len(dico))
    return dico, tag_to_id, id_to_tag
Example #27
0
def pos_mapping(sentences):
    """
    Create a dictionary and a mapping of pos tags.

    Args:
        sentences: iterable of sentences; each sentence is a sized
            iterable of tokens, and index 1 of every token is collected.

    Returns:
        (dico, pos_tag_to_id, id_to_pos_tag): the create_dico result (with
        an extra '<UNKPOS>' entry) and the two values returned by
        create_mapping(dico).
    """
    pos_tags = [[x[1] for x in s] for s in sentences]
    dico = create_dico(pos_tags)
    # sys.maxint was removed in Python 3; fall back to sys.maxsize there
    # and keep the exact original value on Python 2.
    dico['<UNKPOS>'] = getattr(sys, 'maxint', sys.maxsize)
    pos_tag_to_id, id_to_pos_tag = create_mapping(dico)
    # Parenthesised so the line parses on Python 3 and prints the same text
    # on Python 2.
    print("Found %i unique pos tags (%i in total)" % (
        len(dico), sum(len(x) for x in pos_tags)))
    return dico, pos_tag_to_id, id_to_pos_tag
Example #28
0
def tag_mapping(sentences):
    """
    Build the tag dictionary and both lookup tables.

    The last field of every token is collected per sentence.  Entries for
    model.START_TAG (-1) and model.STOP_TAG (-2) are added to the
    dictionary before the mappings are created.
    """
    tags = []
    for sentence in sentences:
        tags.append([token[-1] for token in sentence])
    dico = create_dico(tags)
    dico[model.START_TAG] = -1
    dico[model.STOP_TAG] = -2
    tag_to_id, id_to_tag = create_mapping(dico)
    n_tags = len(dico)
    print("Found %i unique causality tags" % n_tags)
    return dico, tag_to_id, id_to_tag
Example #29
0
def depNode_mapping(sentences):
    """
    Build the depNode dictionary and both lookup tables.

    Field 3 of every token is collected per sentence; an '<UNK>' entry is
    added to the dictionary before the mappings are created.
    """
    depNode = []
    for sentence in sentences:
        depNode.append([token[3] for token in sentence])
    dico = create_dico(depNode)
    dico['<UNK>'] = 10000000
    depNode_to_id, id_to_depNode = create_mapping(dico)
    total = sum(len(group) for group in depNode)
    print("Found %i unique depNode (%i in total)" % (len(dico), total))
    return dico, depNode_to_id, id_to_depNode
Example #30
0
def word_mapping(sentences, lower):
    """
    Build the word dictionary and both lookup tables.

    The first field of every token is collected per sentence, lower-cased
    when `lower` is true.  An '<UNK>' entry is added to the dictionary
    before the mappings are created.
    """
    if lower:
        words = [[token[0].lower() for token in sent] for sent in sentences]
    else:
        words = [[token[0] for token in sent] for sent in sentences]
    dico = create_dico(words)
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico)
    total = sum(len(sent_words) for sent_words in words)
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, word_to_id, id_to_word
def tag_mapping(data_path, data_type):
    """
    Build the tag dictionary and both lookup tables from a labels file.

    The file at data_path + data_type + "_labels.txt" is read line by
    line; each line is split on single spaces and its last piece is
    dropped.  Entries for model.START_TAG (-1) and model.STOP_TAG (-2) are
    added to the dictionary before the mappings are created.
    """
    labels_file = data_path+data_type+"_labels.txt"
    tags = []
    with open(labels_file, "r") as handle:
        for line in handle:
            pieces = line.split(" ")
            tags.append(pieces[:-1])
    dico = create_dico(tags)
    dico[model.START_TAG] = -1
    dico[model.STOP_TAG] = -2
    tag_to_id, id_to_tag = create_mapping(dico)
    n_tags = len(dico)
    print("Found %i unique named entity tags" % n_tags)
    return dico, tag_to_id, id_to_tag
Example #32
0
def tag_mapping(sentences):
    """
    Build the tag dictionary and both lookup tables.

    The last field of every token is collected per sentence.  Entries for
    model.START_TAG (-1) and model.STOP_TAG (-2) are added to the
    dictionary before the mappings are created.
    """
    tags = []
    for sentence in sentences:
        tags.append([token[-1] for token in sentence])

    dico = create_dico(tags)
    dico[model.START_TAG] = -1
    dico[model.STOP_TAG] = -2

    # NOTE(review): the result is unpacked id-first here -- confirm this
    # matches the return order of create_mapping in this project.
    id_to_tag, tag_to_id = create_mapping(dico)
    return dico, tag_to_id, id_to_tag
Example #33
0
def word_mapping(sentences, lower):
    """
    Collect the first field of every token and build the word vocabulary.

    Words are lower-cased when `lower` is true.  Returns (dico, word_to_id,
    id_to_word): the create_dico result with an extra '<UNK>' entry,
    followed by the two values returned by create_mapping(dico).
    """
    words = []
    for sentence in sentences:
        if lower:
            words.append([token[0].lower() for token in sentence])
        else:
            words.append([token[0] for token in sentence])
    dico = create_dico(words)
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico)
    n_unique = len(dico)
    n_total = sum(len(sent_words) for sent_words in words)
    print("Found %i unique words (%i in total)" % (n_unique, n_total))
    return dico, word_to_id, id_to_word
def word_mapping(sentences, vocabulary_size, pre_train=None):
    """
    Create a dictionary and a mapping of words.

    Args:
        sentences: iterable of sentences; each sentence is a sized
            iterable of tokens, and index 0 of every token is collected.
        vocabulary_size: forwarded as the second argument of
            create_mapping.
        pre_train: optional argument for read_pre_training.  When truthy,
            every word missing from the loaded embeddings has its
            dictionary value reset to 0.

    Returns:
        (dico, word_to_id, id_to_word): the create_dico result (after the
        optional reset and with an extra '<UNK>' entry) and the two values
        returned by create_mapping(dico, vocabulary_size).
    """
    words = [[x[0] for x in s] for s in sentences]
    dico = create_dico(words)
    # NOTE(review): this first mapping is overwritten below; the call is
    # kept in case create_mapping has side effects -- confirm and remove.
    word_to_id, id_to_word = create_mapping(dico, vocabulary_size)
    print("Found %i unique words (%i in total)" %
          (len(dico), sum(len(x) for x in words)))

    if pre_train:
        emb_dictionary = read_pre_training(pre_train)
        # Plain iteration replaces dict.iterkeys(), which Python 3 does not
        # have.  Only values of existing keys are rewritten, so the dict
        # keeps its size while it is being iterated.
        for word in dico:
            if word not in emb_dictionary:
                dico[word] = 0

    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico, vocabulary_size)
    return dico, word_to_id, id_to_word