# Load the manually annotated relations and build one feature vector per row.
# Each row of the file at par.a is tab-separated: label, parent, child.
# parent and child are '_'-joined tokens: the first field is passed to
# make_vector as `parent`/`child`, the second as `parent_pos`/`child_pos`.
print('INFO: Loading annotated data.')
annot_data = []
relations = []
with open(par.a, mode='r', encoding='U8') as f:
    for line in f:
        lab, parent, child = line.rstrip('\n').split('\t')
        # Keep the untouched relation strings; they are what split_data sees.
        relations.append((parent, child))

        # Per-relation custom features are only looked up when par.r is set;
        # the copy keeps make_vector from mutating the shared entry.
        cust_feat = dict(custom_features[(parent, child)]) if par.r else {}

        child = child.split('_')
        parent = parent.split('_')
        features = make_vector(parent=parent[0], parent_pos=parent[1],
                               child=child[0], child_pos=child[1],
                               custom=cust_feat)
        # A '+' label marks a positive example; any other label is negative.
        annot_data.append({**features, 'result': lab == '+'})

# split annotated data on train/validation/holdout
divided = split_data(relations, train=0.65, validation=0.15,
                     holdout=0.2, random_seed=24)

# annot_data and relations were appended in lockstep, so pair them directly
# instead of rebuilding the key from feature-vector fields (which breaks as
# soon as make_vector does not echo its inputs verbatim or a token carries
# more than two '_'-separated fields).
# NOTE(review): assumes split_data returns a mapping keyed by the
# (parent, child) tuples it was given -- confirm against its definition.
for item, relation in zip(annot_data, relations):
    item['data'] = divided[relation]
# load data and assign features
# Each row of the file at par.a is tab-separated: label, parent, child.
# parent and child are '_'-joined tokens with three fields: the first is
# stored as the custom feature 'Pid'/'Cid', the second is passed to
# make_vector as `parent`/`child`, and only the first character of the third
# is passed as `parent_pos`/`child_pos`.
print('INFO: Loading annotated data.')
annot_data = []
relations = []
with open(par.a, mode='r', encoding='U8') as f:
    for line in f:
        lab, parent, child = line.rstrip('\n').split('\t')
        # Keep the untouched relation strings; they are what split_data sees.
        relations.append((parent, child))

        child = child.split('_')
        parent = parent.split('_')
        features = make_vector(parent=parent[1], parent_pos=parent[2][0],
                               child=child[1], child_pos=child[2][0],
                               custom={'Pid': parent[0], 'Cid': child[0]})
        # A '+' label marks a positive example; any other label is negative.
        annot_data.append({**features, 'result': lab == '+'})

# split annotated data on train/validation/holdout
divided = split_data(relations, train=0.65, validation=0.15,
                     holdout=0.2, random_seed=24)

# annot_data and relations were appended in lockstep, so pair them directly.
# Rebuilding the key as Pid + '_' + parent + '_' + parentPos cannot match the
# original relation string when the third field is longer than one character,
# because only its first character was handed to make_vector.
# NOTE(review): assumes split_data returns a mapping keyed by the
# (parent, child) tuples it was given -- confirm against its definition.
for item, relation in zip(annot_data, relations):
    item['data'] = divided[relation]
'stThPar', 'stFoPar', 'stFiPar', 'enTwPar', 'enThPar', 'enFoPar', 'enFiPar', 'enTwChi', 'enThChi', 'enFoChi', 'enFiChi') # load manually annotated data rows_list = list() with open(par.a, mode='r', encoding='utf-8') as f: for line in f: lab, child, parent = line.rstrip('\n').split('\t') child = child.split('_') parent = parent.split('_') lab = 1 if lab == '+' else 0 features = make_vector(parent=parent[0], parent_pos=parent[1], child=child[0], child_pos=child[1]) vector = OrderedDict() for key in included_features: vector[key] = features[key] vector = {**vector, **{'result': lab}} rows_list.append(vector) # load data to predict (if given) if par.p: with open(par.p, mode='r', encoding='utf-8') as f: for line in f: child, parent = line.rstrip('\n').split('\t') child = child.split('_')