Ejemplo n.º 1
0
# Read the annotation file: every line is "label<TAB>parent<TAB>child",
# where parent and child are '_'-separated (field 0 and field 1 are used).
print('INFO: Loading annotated data.')
annot_data = []
relations = []
with open(par.a, mode='r', encoding='U8') as f:
    for line in f:
        lab, parent, child = line.rstrip('\n').split('\t')
        pair = (parent, child)
        # remember the raw pair; split_data receives these tuples below
        relations.append(pair)
        # custom features are looked up only when par.r is set
        cust_feat = dict(custom_features[pair]) if par.r else {}
        child = child.split('_')
        parent = parent.split('_')
        # '+' marks a positive example, anything else is negative
        lab = lab == '+'
        features = make_vector(
            parent=parent[0],
            parent_pos=parent[1],
            child=child[0],
            child_pos=child[1],
            custom=cust_feat,
        )
        annot_data.append({**features, 'result': lab})

# split annotated data on train/validation/holdout
divided = split_data(
    relations,
    train=0.65,
    validation=0.15,
    holdout=0.2,
    random_seed=24,
)
# Rebuild each "first_second" key from the vector's own fields and store
# the value split_data assigned to that pair under 'data'.
for item in annot_data:
    parent = '_'.join((item['parent'], item['parentPos']))
    child = '_'.join((item['child'], item['childPos']))
    item['data'] = divided[(parent, child)]
# load data and assign features
# Every line is "label<TAB>parent<TAB>child"; parent and child are
# '_'-separated and fields 0, 1 and 2 are read below (stored as
# 'Pid'/'Cid', passed as parent/child, and passed as the POS value).
print('INFO: Loading annotated data.')
annot_data = []
relations = []
with open(par.a, mode='r', encoding='U8') as f:
    for line in f:
        lab, parent, child = line.rstrip('\n').split('\t')
        # keep the untouched strings: these tuples are what split_data gets
        relations.append((parent, child))
        child = child.split('_')
        parent = parent.split('_')
        # '+' marks a positive example, anything else is negative
        lab = lab == '+'
        # NOTE: only the first character of field 2 is passed as the POS value
        features = make_vector(parent=parent[1],
                               parent_pos=parent[2][0],
                               child=child[1],
                               child_pos=child[2][0],
                               custom={
                                   'Pid': parent[0],
                                   'Cid': child[0]
                               })
        annot_data.append({**features, 'result': lab})

# split annotated data on train/validation/holdout
divided = split_data(relations,
                     train=0.65,
                     validation=0.15,
                     holdout=0.2,
                     random_seed=24)
# annot_data and relations are appended once per input line, so they are
# aligned index by index. Look the split up with the original relation tuple
# instead of re-assembling it from the vector: the POS value was cut to one
# character above, so a re-assembled key would not match (KeyError) whenever
# field 2 is longer than one character.
# Assumes split_data returns a mapping keyed by the tuples it was given
# (the previous lookup relied on the same) -- TODO confirm.
for item, relation in zip(annot_data, relations):
    item['data'] = divided[relation]
Ejemplo n.º 3
0
                     'stThPar', 'stFoPar', 'stFiPar', 'enTwPar', 'enThPar',
                     'enFoPar', 'enFiPar', 'enTwChi', 'enThChi', 'enFoChi',
                     'enFiChi')

# load manually annotated data
# Every line is "label<TAB>child<TAB>parent"; child and parent are
# '_'-separated (field 0 and field 1 are used).
rows_list = []
with open(par.a, mode='r', encoding='utf-8') as f:
    for line in f:
        lab, child, parent = line.rstrip('\n').split('\t')

        child = child.split('_')
        parent = parent.split('_')
        # 1 for a '+' label, 0 for anything else
        lab = int(lab == '+')

        features = make_vector(
            parent=parent[0],
            parent_pos=parent[1],
            child=child[0],
            child_pos=child[1],
        )
        # keep only the selected features, in the order of included_features,
        # then attach the label
        vector = {key: features[key] for key in included_features}
        vector['result'] = lab

        rows_list.append(vector)

# load data to predict (if given)
if par.p:
    with open(par.p, mode='r', encoding='utf-8') as f:
        for line in f:
            child, parent = line.rstrip('\n').split('\t')

            child = child.split('_')