# Assumed module context: ElementTree-parsed SynTagRus-style XML, where each
# sentence <S> holds <W> tokens with ID/DOM/LINK/LEMMA/FEAT attributes, and a
# get_children(sent, head_id, links=None) helper returns a token's dependents.


def collect_chain(sent, head_token, link):
    stack = get_children(sent, head_token.attrib['ID'], links=link)
    chain = []
    while stack:
        candidate = stack.pop()
        chain.append(candidate)
        stack.extend(get_children(sent, candidate.attrib['ID'], links=link))
    return sorted(chain + [head_token], key=lambda x: int(x.attrib['ID']))
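# Usage sketch (hedged): collect_chain gathers the transitive closure of a
# token's dependents over a single link type and returns the span in surface
# order, head included. The tiny <S> element below and the exact behaviour of
# get_children are illustrative assumptions, not data from the corpus.
def _demo_collect_chain():
    import xml.etree.ElementTree as ET
    sent = ET.fromstring(
        '<S>'
        '<W ID="1" DOM="_root" LEMMA="по">по</W>'
        '<W ID="2" DOM="1" LINK="fixed" LEMMA="крайний">крайней</W>'
        '<W ID="3" DOM="2" LINK="fixed" LEMMA="мера">мере</W>'
        '</S>')
    head = sent.findall('W')[0]
    chain = collect_chain(sent, head, 'fixed')
    assert [w.text for w in chain] == ['по', 'крайней', 'мере']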
def get_fixed_info(sent, head_token):
    children = get_children(sent, head_token.attrib['ID'])
    candidate_list = sorted(children + [head_token], key=lambda x: int(x.attrib['ID']))
    lemma_list = tuple(item.attrib['LEMMA'] for item in candidate_list)
    friend_start, friend_end = None, None
    onetwo_start, onetwo_end = None, None
    etc_start, etc_end = None, None
    for i, item in enumerate(lemma_list):
        # 'друг PR друг' ("each other")
        if item == 'друг':
            if friend_start is None:
                friend_start = i
            else:
                friend_end = friend_end or i + 1
        # 'один PR другой' ("one ... the other")
        elif item == 'один':
            onetwo_start = onetwo_start or i
        elif item == 'другой' and onetwo_start is not None:
            onetwo_end = i + 1
    # 'и так далее' ("and so on")
    if 'и так далее' in ' '.join(lemma_list):
        etc_start = lemma_list.index('и')
        etc_end = etc_start + 3
    for trim_start, trim_end in [
            (friend_start, friend_end),
            (onetwo_start, onetwo_end),
            (etc_start, etc_end),
    ]:
        if trim_end is not None:
            candidate_list = candidate_list[trim_start:trim_end]
            lemma_list = tuple(lemma_list[trim_start:trim_end])
            break
    else:
        children = get_children(sent, head_token.attrib['ID'], links='fixed')
        candidate_list = sorted(children + [head_token], key=lambda x: int(x.attrib['ID']))
        lemma_list = tuple(item.attrib['LEMMA'] for item in candidate_list)
    if ' '.join(lemma_list) == 'точка зрение':
        link = 'compound'  # the only compound
    elif any(len(lemma) == 2 and lemma.endswith('.') for lemma in lemma_list):
        link = 'flat:name'  # initials
    else:
        link = 'fixed'
    return candidate_list, link
def flatten(sent, head_token, candidate_list, link_to_use):
    link_to_use = link_to_use + '_already'
    # new head
    new_head = candidate_list[0]
    new_head.attrib['DOM'] = head_token.attrib['DOM']
    if 'LINK' in head_token.attrib:
        new_head.attrib['LINK'] = head_token.attrib['LINK']
    elif 'LINK' in new_head.attrib:
        del new_head.attrib['LINK']
    # repossess all children
    new_children_ids = set()
    for item in candidate_list:
        new_children_ids |= set(
            int(child.attrib['ID']) - 1
            for child in get_children(sent, item.attrib['ID']))
    new_children_ids -= set(int(item.attrib['ID']) - 1 for item in candidate_list)
    for new_child_id in new_children_ids:
        sent.findall('W')[new_child_id].attrib['DOM'] = new_head.attrib['ID']
    # repossess all words that are included in this fixed expression
    for item in candidate_list[1:]:
        item.attrib['DOM'] = new_head.attrib['ID']
        item.attrib['LINK'] = link_to_use
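# Usage note (hedged): get_fixed_info and flatten are designed as a pair.
# get_fixed_info selects the span of a fixed expression around head_token and
# decides which relation to use ('fixed', 'flat:name' for initials, or
# 'compound' for 'точка зрения'); flatten then re-heads the whole span on its
# first token:
#
#     candidate_list, link = get_fixed_info(sent, head_token)
#     flatten(sent, head_token, candidate_list, link)
#
# Afterwards the non-initial tokens carry LINK '<link>_already' and any outside
# dependents of the span are re-attached to the new head.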
def munch(ifiles, ofiles):
    """
    Process all files in ifiles list. Output into ofiles list.
    """
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sentence in root[-1].findall('S'):
            # step 1: collect token numbers old:new
            numbering = {}
            fantom_number = 0
            token_number = 0
            for token in sentence.findall('W'):
                if token.text != 'FANTOM':
                    token_number += 1
                    fantom_number = 0
                    numbering[token.attrib['ID']] = str(token_number)
                else:
                    fantom_number += 1
                    numbering[token.attrib['ID']] = str(token_number) + '.' + str(fantom_number)
            # step 2: assign new numbers
            for word in sentence.findall('W'):
                word.attrib['ID'] = numbering[word.attrib['ID']]
                if word.attrib['DOM'] != '_root':
                    word.attrib['DOM'] = numbering[word.attrib['DOM']]
            # step 3: add new attribute for enhanced representation
            for elem in sentence.findall('W'):
                if elem.attrib['DOM'] == '_root':
                    elem.attrib['ENH'] = '0:root'
                else:
                    elem.attrib['ENH'] = elem.attrib['DOM'] + ':' + elem.attrib['LINK']
            # step 7: fix head ellipsis (runs before steps 4-6)
            for token in sentence.findall('W'):
                if token.text == 'FANTOM' and token.attrib['DOM'] == '_root':
                    candidate_children = get_children(sentence, token.attrib['ID'])
                    children = []
                    fantom_children = []
                    for child in candidate_children:
                        # real children to the left, fantom children to the right
                        if child.text == 'FANTOM':
                            fantom_children.append(child)
                        else:
                            children.append(child)
                    guy_to_promote = None  # haven't found him yet
                    token.attrib['LINK'] = 'none'
                    if len(children) == 1:
                        guy_to_promote = children[0]
                        new_children = get_children(sentence, children[0].attrib['ID'])
                        children[0].attrib['DOM'] = '_root'
                        children[0].attrib['ENH'] = '0:root'
                        del children[0].attrib['LINK']
                        token.attrib['DOM'] = children[0].attrib['ID']
                        if len(new_children) == 1:
                            if new_children[0].text != 'FANTOM' and new_children[0].attrib['LINK'] != 'parataxis':
                                new_children[0].attrib['LINK'] = 'orphan'
                        elif len(new_children) == 2 and any(n_ch.attrib['LINK'] == 'fixed' for n_ch in new_children):
                            for ch in new_children:
                                if ch.attrib['LINK'] != 'fixed':
                                    if ch.text != 'FANTOM' and ch.attrib['LINK'] != 'parataxis':
                                        ch.attrib['LINK'] = 'orphan'
                                    break
                        else:
                            for ch in new_children:
                                if ch.attrib['LINK'] != 'iobj':
                                    if ch.text != 'FANTOM' and ch.attrib['LINK'] != 'parataxis':
                                        ch.attrib['LINK'] = 'orphan'
                                    break
                    elif len(children) >= 2:
                        if token.attrib['FEAT'].split()[0] in {'PROPN', 'NOUN', 'PRON', 'SYM'}:
                            if any(child.attrib['LINK'] == 'nsubj' for child in children):
                                for item in children:
                                    if item.attrib['LINK'] == 'nsubj':  #**
                                        guy_to_promote = item
                                        item.attrib['DOM'] = '_root'
                                        item.attrib['ENH'] = '0:root'
                                        del item.attrib['LINK']
                                        token.attrib['DOM'] = item.attrib['ID']
                                        for elem in children:
                                            if elem.attrib['ID'] != item.attrib['ID']:
                                                elem.attrib['DOM'] = item.attrib['ID']
                                                if elem.text != 'FANTOM' and elem.attrib['LINK'] != 'parataxis':
                                                    elem.attrib['LINK'] = 'orphan'
                                        break
                            elif any(child.attrib['LINK'] in promotion_nominal for child in children):
                                # UD relations priority
                                children.sort(key=lambda x: promotion_nominal.get(x.attrib['LINK'], 100))
                                if children[0].attrib['LINK'] != children[1].attrib['LINK']:  #**
                                    guy_to_promote = children[0]
                                    children[0].attrib['DOM'] = '_root'
                                    children[0].attrib['ENH'] = '0:root'
                                    del children[0].attrib['LINK']
                                    token.attrib['DOM'] = children[0].attrib['ID']
                                    for elem in children:
                                        if elem.attrib['ID'] != children[0].attrib['ID']:
                                            elem.attrib['DOM'] = children[0].attrib['ID']
                                else:
                                    # original relations priority
                                    if any(child.attrib['OLD'] in priority for child in children):
                                        children.sort(key=lambda x: priority.get(x.attrib['OLD'], 100))
                                        if children[0].attrib['OLD'] != children[1].attrib['OLD']:  #**
                                            guy_to_promote = children[0]
                                            children[0].attrib['DOM'] = '_root'
                                            children[0].attrib['ENH'] = '0:root'
                                            del children[0].attrib['LINK']
                                            token.attrib['DOM'] = children[0].attrib['ID']
                                            for elem in children:
                                                if elem.attrib['ID'] != children[0].attrib['ID']:
                                                    elem.attrib['DOM'] = children[0].attrib['ID']
                        elif any(child.attrib['LINK'] in promotion for child in children):
                            # UD relations priority
                            children.sort(key=lambda x: promotion.get(x.attrib['LINK'], 100))
                            if children[0].attrib['LINK'] != children[1].attrib['LINK']:
                                guy_to_promote = children[0]
                                children[0].attrib['DOM'] = '_root'
                                children[0].attrib['ENH'] = '0:root'
                                del children[0].attrib['LINK']
                                token.attrib['DOM'] = children[0].attrib['ID']
                                for elem in children:
                                    if elem.attrib['ID'] != children[0].attrib['ID']:
                                        elem.attrib['DOM'] = children[0].attrib['ID']
                                        if elem.text != 'FANTOM' and elem.attrib['LINK'] != 'parataxis':
                                            elem.attrib['LINK'] = 'orphan'
                            else:
                                # original relations priority
                                if any(child.attrib['OLD'] in priority for child in children):
                                    children.sort(key=lambda x: priority.get(x.attrib['OLD'], 100))
                                    # if children[0].attrib['OLD'] != children[1].attrib['OLD']:
                                    # we can't distinguish them in any further way,
                                    # so we just pick the first one regardless
                                    guy_to_promote = children[0]
                                    children[0].attrib['DOM'] = '_root'
                                    children[0].attrib['ENH'] = '0:root'
                                    del children[0].attrib['LINK']
                                    token.attrib['DOM'] = children[0].attrib['ID']
                                    for elem in children:
                                        if elem.attrib['ID'] != children[0].attrib['ID']:
                                            elem.attrib['DOM'] = children[0].attrib['ID']
                                            if elem.text != 'FANTOM' and elem.attrib['LINK'] != 'parataxis':
                                                elem.attrib['LINK'] = 'orphan'
                        else:
                            # parataxis: 2 examples
                            for elem in children:
                                if elem.attrib['LINK'] == 'parataxis':
                                    guy_to_promote = elem  # the promoted parataxis child
                                    elem.attrib['DOM'] = '_root'
                                    elem.attrib['ENH'] = '0:root'
                                    del elem.attrib['LINK']
                                    token.attrib['DOM'] = elem.attrib['ID']
                                    for it in children:
                                        if it.attrib['ID'] != elem.attrib['ID']:
                                            it.attrib['DOM'] = elem.attrib['ID']
                                            if it.text != 'FANTOM' and it.attrib['LINK'] != 'parataxis':
                                                it.attrib['LINK'] = 'orphan'
                    # rehang fantom children onto guy_to_promote
                    if guy_to_promote is not None:
                        for fantom_child in fantom_children:
                            fantom_child.attrib['DOM'] = guy_to_promote.attrib['ID']
                    break
            # step 4: detect orphan deprel
            for token in sentence.findall('W'):
                if token.text != 'FANTOM':
                    children = get_children(sentence, token.attrib['ID'])
                    if all(child.text != 'FANTOM' for child in children):
                        continue
                    # populate with initial fantoms
                    fantom_list = [child for child in children if child.text == 'FANTOM']
                    fantom_queue = [fantom for fantom in fantom_list]
                    while fantom_queue:
                        current_fantom = fantom_queue.pop(0)
                        grand_children = get_children(sentence, current_fantom.attrib['ID'])
                        for ch in grand_children:
                            if ch.text == 'FANTOM':
                                fantom_queue.append(ch)
                                fantom_list.append(ch)
                    # fix unexpected orphans in fantoms
                    for fantom in fantom_list:
                        if fantom.attrib['LINK'] == 'orphan':
                            fantom.attrib['LINK'] = fantom.attrib['ENH'].split(':', maxsplit=1)[1]
                    for initial_fantom in fantom_list[::-1]:
                        children_list = [
                            child for child in get_children(sentence, initial_fantom.attrib['ID'])
                            if child.text != 'FANTOM'
                        ]
                        nominal_successful = False
                        fantom_feat = initial_fantom.attrib['FEAT'].split()[0]
                        if fantom_feat in {'PROPN', 'NOUN', 'PRON', 'SYM', 'ADJ'}:
                            if any(child.attrib['LINK'] == 'nsubj' for child in children_list):
                                for item in children_list:
                                    if item.attrib['LINK'] == 'nsubj':
                                        item.attrib['LINK'] = initial_fantom.attrib['LINK']
                                        item.attrib['DOM'] = initial_fantom.attrib['DOM']
                                        for elem in children_list:
                                            if elem.attrib['ID'] != item.attrib['ID']:
                                                elem.attrib['DOM'] = item.attrib['ID']
                                                elem.attrib['LINK'] = 'orphan'
                                        break
                                nominal_successful = True
                            else:
                                if len(children_list) == 1:
                                    if children_list[0].attrib['LINK'] != 'acl':
                                        children_list[0].attrib['LINK'] = initial_fantom.attrib['LINK']
                                        children_list[0].attrib['DOM'] = initial_fantom.attrib['DOM']
                                        nominal_successful = True
                                else:
                                    promotion_sorted, priority_sorted, evolution_list = None, None, None
                                    if any(child.attrib['LINK'] in promotion_nominal for child in children_list):
                                        # UD relations priority
                                        promotion_sorted = sorted(
                                            children_list,
                                            key=lambda x: promotion_nominal.get(x.attrib['LINK'], 100))
                                    if any(child.attrib['OLD'] in priority_nominal for child in children_list):
                                        # original relations priority
                                        priority_sorted = sorted(
                                            children_list,
                                            key=lambda x: priority_nominal.get(x.attrib['OLD'], 100))
                                    if promotion_sorted is None:
                                        evolution_list = priority_sorted
                                    elif promotion_sorted[0].attrib['LINK'] == promotion_sorted[1].attrib['LINK']:
                                        if promotion_sorted[0].attrib['LINK'] == 'amod':
                                            evolution_list = promotion_sorted
                                        elif priority_sorted is None:
                                            evolution_list = promotion_sorted
                                        elif priority_sorted[0].attrib['OLD'] != priority_sorted[1].attrib['OLD']:
                                            evolution_list = priority_sorted
                                        else:
                                            evolution_list = promotion_sorted
                                    else:
                                        evolution_list = promotion_sorted
                                    if evolution_list is not None:
                                        children_list = evolution_list
                                        children_list[0].attrib['LINK'] = initial_fantom.attrib['LINK']
                                        children_list[0].attrib['DOM'] = initial_fantom.attrib['DOM']
                                        for elem in children_list:
                                            if elem.attrib['ID'] != children_list[0].attrib['ID']:
                                                elem.attrib['DOM'] = children_list[0].attrib['ID']
                                        nominal_successful = True
                        if not nominal_successful:
                            if len(children_list) == 1:
                                children_list[0].attrib['LINK'] = initial_fantom.attrib['LINK']
                                children_list[0].attrib['DOM'] = initial_fantom.attrib['DOM']
                            else:
                                promotion_sorted, priority_sorted, evolution_list = None, None, None
                                if any(child.attrib['LINK'] in promotion for child in children_list):
                                    # UD relations priority
                                    promotion_sorted = sorted(
                                        children_list,
                                        key=lambda x: promotion.get(x.attrib['LINK'], 100))
                                if any(child.attrib['OLD'] in priority for child in children_list):
                                    # original relations priority
                                    priority_sorted = sorted(
                                        children_list,
                                        key=lambda x: priority.get(x.attrib['OLD'], 100))
                                if promotion_sorted is None:
                                    evolution_list = priority_sorted
                                elif promotion_sorted[0].attrib['LINK'] == promotion_sorted[1].attrib['LINK']:
                                    if priority_sorted is None:
                                        evolution_list = promotion_sorted
                                    elif priority_sorted[0].attrib['OLD'] != priority_sorted[1].attrib['OLD']:
                                        evolution_list = priority_sorted
                                    else:
                                        evolution_list = promotion_sorted
                                else:
                                    evolution_list = promotion_sorted
                                if evolution_list is not None:
                                    children_list = evolution_list
                                    children_list[0].attrib['LINK'] = initial_fantom.attrib['LINK']
                                    children_list[0].attrib['DOM'] = initial_fantom.attrib['DOM']
                                    for elem in children_list:
                                        if elem.attrib['ID'] != children_list[0].attrib['ID']:
                                            elem.attrib['DOM'] = children_list[0].attrib['ID']
                                            if elem.attrib['LINK'] not in {'cc', 'mark', 'parataxis', 'conj'}:
                                                elem.attrib['LINK'] = 'orphan'
                                else:
                                    if any(child.attrib['LINK'] == 'discourse' and child.attrib['LEMMA'] == 'нет'
                                           for child in children_list):
                                        for elem in children_list:
                                            if elem.attrib['LINK'] == 'discourse' and elem.attrib['LEMMA'] == 'нет':
                                                elem.attrib['LINK'] = initial_fantom.attrib['LINK']
                                                elem.attrib['DOM'] = initial_fantom.attrib['DOM']
                                                for item in children_list:
                                                    if item.attrib['ID'] != elem.attrib['ID']:
                                                        item.attrib['DOM'] = elem.attrib['ID']
                                                        if item.attrib['LINK'] not in {'cc', 'mark', 'parataxis', 'conj'}:
                                                            item.attrib['LINK'] = 'orphan'
                                    elif any(child.attrib['LINK'] == 'advcl' for child in children_list):
                                        for elem in children_list:
                                            if elem.attrib['LINK'] == 'advcl':
                                                elem.attrib['LINK'] = initial_fantom.attrib['LINK']
                                                elem.attrib['DOM'] = initial_fantom.attrib['DOM']
                                                for item in children_list:
                                                    if item.attrib['ID'] != elem.attrib['ID']:
                                                        item.attrib['DOM'] = elem.attrib['ID']
                                                        if item.attrib['LINK'] not in {'cc', 'mark', 'parataxis', 'conj'}:
                                                            item.attrib['LINK'] = 'orphan'
                                    elif any(child.attrib['LINK'] == 'discourse' for child in children_list):
                                        for elem in children_list:
                                            if elem.attrib['LINK'] == 'discourse':
                                                elem.attrib['LINK'] = initial_fantom.attrib['LINK']
                                                elem.attrib['DOM'] = initial_fantom.attrib['DOM']
                                                for item in children_list:
                                                    if item.attrib['ID'] != elem.attrib['ID']:
                                                        item.attrib['DOM'] = elem.attrib['ID']
                                                        if item.attrib['LINK'] not in {'cc', 'mark', 'parataxis', 'conj'}:
                                                            item.attrib['LINK'] = 'orphan'
            # step 5: delete 'cop' fantom tokens (preparations)
            # well, looks like not only 'cop', but all fantoms
            # that are leaves and have another fantom as a head
            for token in sentence.findall('W'):
                change_number = {}
                if token.text == 'FANTOM':
                    children = get_enh_children(sentence, token.attrib['ID'])
                    if children == []:
                        token.attrib['DEL'] = 'YES'
                        current_fantom = round(float(token.attrib['ID']), 1)
                        start_token = round(float(token.attrib['ID'].split('.')[0]), 1)
                        end_token = start_token + 1
                        for elem in sentence.findall('W'):
                            if start_token < round(float(elem.attrib['ID']), 1) < end_token:
                                if round(float(elem.attrib['ID']), 1) > current_fantom:
                                    change_number[elem.attrib['ID']] = str(round(float(elem.attrib['ID']) - 0.1, 1))
                if change_number != {}:
                    for fantom in sentence.findall('W'):
                        if fantom.attrib['ID'] in change_number:
                            fantom.attrib['ID'] = change_number[fantom.attrib['ID']]
                        if fantom.attrib['DOM'] != '_root' and fantom.attrib['DOM'] in change_number:
                            fantom.attrib['DOM'] = change_number[fantom.attrib['DOM']]
                        enh_no = fantom.attrib['ENH'].split(':')[0]
                        if enh_no in change_number:
                            fantom.attrib['ENH'] = fantom.attrib['ENH'].replace(enh_no, change_number[enh_no])
            # step 6: delete 'cop' fantom tokens (deletion)
            for token in sentence.findall('W'):
                # and token.attrib.get('LINK', 'EMPTY') == 'cop':
                if token.text == 'FANTOM' and token.attrib.get('DEL', 'EMPTY') == 'YES':
                    sentence.remove(token)
            # fix orphan + CCONJ 29.11.17
            for token in sentence.findall('W'):
                if (token.attrib.get('FEAT', 'EMPTY').split()[0] in {'CCONJ', 'SCONJ'}
                        and token.attrib.get('LINK', 'EMPTY') == 'orphan'):
                    if token.attrib['ENH'].split(':')[1] == 'orphan':
                        if token.attrib['LEMMA'] == 'чтобы':
                            token.attrib['LINK'] = 'mark'
                        else:
                            token.attrib['LINK'] = 'cc'
                    else:
                        token.attrib['LINK'] = token.attrib['ENH'].split(':')[1]  # Something went wrong
            for token in sentence.findall('W'):
                if token.text != 'FANTOM' and '.' in token.get('DOM', ''):
                    print('-' * 20)
                    print(ifname)
                    print('-' * 20)
                    print(token.attrib['DOM'])
                    print(*[ch.attrib['LINK'] for ch in get_children(sentence, token.attrib['DOM'])])
                    print(*[ch.attrib['OLD'] for ch in get_children(sentence, token.attrib['DOM'])])
                    print('-' * 20)
                    for item in sentence.findall('W'):
                        print(item.text, item.attrib['LEMMA'], item.attrib['FEAT'].split()[0],
                              item.attrib['ID'], item.attrib['DOM'],
                              item.attrib.get('LINK', ''), item.attrib['ENH'])
                    print('=' * 20)
        tree.write(ofname, encoding="UTF-8")
    return
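# Example invocation (hypothetical paths; munch expects parallel lists of input
# and output XML file names):
#
#     munch(['data/in/sample.xml'], ['data/out/sample.xml'])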
def check_citation(sentence, symbol, i, token_id, file_name, start):
    sentence_element = sentence
    sentence = sentence.findall('W')
    if start:
        for tok in sentence[:i + 1]:
            root_token = [
                token for token in sentence
                if token.attrib['DOM'] == '_root' and '.' not in token.attrib['ID']
            ]
            if tok.attrib['DOM'] == '_root':
                if all(ch.attrib.get('LINK', 'EMPTY') != 'parataxis'
                       for ch in get_children(sentence_element, tok.attrib['ID'])):
                    for j, new_tok in enumerate(sentence[i + 1:]):
                        if (',-' in new_tok.tail.strip().replace(' ', '').replace('\n', '')
                                or ',"' in new_tok.tail.strip().replace(' ', '').replace('\n', '')):
                            for t in sentence[i + 1:i + j + 2]:
                                if (float(t.attrib['DOM']) < float(sentence[i + 1].attrib['ID'])
                                        or float(t.attrib['DOM']) > float(sentence[i + j + 2].attrib['ID'])
                                        and t.text != 'FANTOM'):
                                    t.attrib['LINK'] = 'parataxis'
                                    t.attrib['DOM'] = root_token[0].attrib['ID']
                            for t in sentence[i + j + 2:]:
                                if (float(t.attrib['DOM']) < float(sentence[i + j + 2].attrib['ID'])
                                        and t.text != 'FANTOM'
                                        and t.attrib['LINK'] in ['orphan', 'conj']):
                                    head_token = [
                                        token.attrib for token in sentence
                                        if token.attrib['ID'] == t.attrib['DOM']
                                    ]
                                    if head_token[0].get('LINK', 'EMPTY') != 'conj':
                                        t.attrib['LINK'] = 'conj'
                                        t.attrib['DOM'] = root_token[0].attrib['ID']
                            break
                    else:
                        local_list = []
                        for t in sentence[i + 1:]:
                            if float(t.attrib['DOM']) <= float(token_id):
                                local_list.append(t)
                        if len(local_list) > 0:
                            local_list[0].attrib['LINK'] = 'parataxis'
                            local_list[0].attrib['DOM'] = root_token[0].attrib['ID']
                    break
                else:
                    for j, t in enumerate(sentence[i + 1:]):
                        if (citation_punct_re.search(t.tail) is not None
                                and end_of_citation_re.search(t.tail) is not None
                                and whitespace_re.sub('', t.tail) != '",'):
                            if any(token.attrib['DOM'] == '_root' for token in sentence[i + j + 2:]):
                                break
                    else:
                        if not any(',-' in t.tail.strip().replace(' ', '').replace('\n', '')
                                   or ',"' in t.tail.strip().replace(' ', '').replace('\n', '')
                                   for t in sentence[i + 1:]):
                            candidates = [
                                t for t in sentence[:i + 1]
                                if t.attrib['DOM'] == root_token[0].attrib['ID']
                            ]
                            if len(candidates) == 1:
                                candidates[0].attrib['DOM'] = '_root'
                                del candidates[0].attrib['LINK']
                                root_token[0].attrib['DOM'] = candidates[0].attrib['ID']
                                root_token[0].attrib['LINK'] = 'parataxis'
                            else:
                                # exceptions
                                if sentence[0].text == 'Николай':
                                    print('\nException:')
                                    print(' '.join((file_name.split('/')[-1], sentence_element.attrib['ID'])))
                                    print_sentence(sentence_element)
                                    sentence[0].attrib['LINK'] = 'vocative'
                                    sentence[1].attrib['LINK'] = 'flat:name'
                                    print('\nCorrected to:')
                                    print_sentence(sentence_element)
                                elif sentence[0].text == 'Быстро':
                                    print('\nException:')
                                    print(' '.join((file_name.split('/')[-1], sentence_element.attrib['ID'])))
                                    print_sentence(sentence_element)
                                    sentence[3].attrib['DOM'] = '_root'
                                    del sentence[3].attrib['LINK']
                                    sentence[4].attrib['DOM'] = '3'
                                    sentence[4].attrib['LINK'] = 'parataxis'
                                    print('\nCorrected to:')
                                    print_sentence(sentence_element)
                                else:
                                    # print(i, token_id, sentence[0].text, sentence[1].text)
                                    # print('+' * 20)
                                    # print(*[(token.attrib['ID'], token.text, token.attrib['DOM'],
                                    #          token.attrib.get('LINK', 'EMPTY'), token.tail)
                                    #         for token in sentence], sep='\n')
                                    # print('*' * 20)
                                    pass
    else:
        if any(token.attrib['DOM'] == '_root' for token in sentence[:i + 1]):
            candidates = [
                t for t in sentence[i + 1:]
                if t.text != 'FANTOM' and '.' not in t.attrib['DOM'] and int(t.attrib['DOM']) <= i + 1
            ]
            if len(candidates) == 1 and candidates[0].attrib['LINK'] != 'parataxis':
                candidates[0].attrib['LINK'] = 'parataxis'
            elif len(candidates) == 1 and candidates[0].attrib['LINK'] == 'parataxis':
                pass
            elif len(candidates) > 1:
                for j, t in enumerate(sentence[i + 1:]):
                    if (',"' in t.tail.strip().replace(' ', '').replace('\n', '')
                            or '!"' in t.tail.strip().replace(' ', '').replace('\n', '')
                            or '?"' in t.tail.strip().replace(' ', '').replace('\n', '')):
                        candidate = [
                            t for t in sentence[i + 1:i + j + 2]
                            if t.text != 'FANTOM' and '.' not in t.attrib['DOM'] and int(t.attrib['DOM']) <= i + 1
                        ]
                        if len(candidate) == 1:
                            candidate[0].attrib['LINK'] = 'parataxis'
                        elif len(candidate) == 2:
                            if any(can.text == 'то' and can.attrib['LINK'] == 'mark' for can in candidate):
                                try:
                                    sentence[23].attrib['DOM'] = '28'
                                except IndexError:
                                    pass
                            elif any(can.text == 'Париже' and can.attrib['LINK'] == 'conj' for can in candidate):
                                print('\nException:')
                                print(' '.join((file_name.split('/')[-1], sentence_element.attrib['ID'])))
                                print_sentence(sentence_element)
                                sentence[9].attrib['LINK'] = 'parataxis'
                                sentence[7].attrib['DOM'] = '10'
                                sentence[7].attrib['LINK'] = 'discourse'
                                print('\nCorrected to:')
                                print_sentence(sentence_element)
                            elif any(can.text == 'смотришь' and can.attrib['LINK'] == 'conj' for can in candidate):
                                print('\nException:')
                                print(' '.join((file_name.split('/')[-1], sentence_element.attrib['ID'])))
                                print_sentence(sentence_element)
                                sentence[12].attrib['LINK'] = 'parataxis'
                                sentence[25].attrib['DOM'] = '13'
                                print('\nCorrected to:')
                                print_sentence(sentence_element)
                        else:
                            if any(can.text == 'Aravot' for can in candidate):
                                for el in candidate:
                                    if el.attrib['LINK'] == 'flat:foreign':
                                        el.attrib['DOM'] = '5'
        else:
            for j, t in enumerate(sentence[i + 1:]):
                if (',"' in t.tail.strip().replace(' ', '').replace('\n', '')
                        or '!"' in t.tail.strip().replace(' ', '').replace('\n', '')
                        or '?"' in t.tail.strip().replace(' ', '').replace('\n', '')
                        or '".' in t.tail.strip().replace(' ', '').replace('\n', '')):
                    if any(token.attrib['DOM'] == '_root' for token in sentence[i + 1:i + j + 2]):
                        root_token = [
                            token for token in sentence
                            if token.attrib['DOM'] == '_root' and '.' not in token.attrib['ID']
                        ]
                        children = get_children(sentence_element, root_token[0].attrib['ID'])
                        if any(ch.attrib['LINK'] in {'parataxis', 'cc'} for ch in children):
                            cand = [c for c in children if c.attrib['LINK'] in {'parataxis', 'cc'}]
                            if len(cand) == 1:
                                root_token[0].attrib['DOM'] = cand[0].attrib['ID']
                                root_token[0].attrib['LINK'] = 'parataxis'
                                cand[0].attrib['DOM'] = '_root'
                                cand[0].attrib.pop('LINK')
                            else:
                                pass
                        else:
                            pass
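# The function below comes from a different pipeline (Foldit puzzle-solution
# analysis). Besides project helpers (get_atoms, get_nid, get_children,
# process_snapshots, EnergyComponent, PuzzleMeta, ROOT_NID), it appears to
# assume roughly these imports (an inference from the names it uses):
#
#   import csv, json, logging, os, re, subprocess, sys
#   from collections import Counter
#   from functools import partial
#   from itertools import groupby, product
#   from multiprocessing import Pool
#   import dill
#   import numpy as np
#   import pandas as pd
#   import scipy.stats
#   from scipy import stats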
def process_puzzle_meta(pid, overwrite=False, snapshot_threads=15):
    metafile = "data/puzzle_solutions/solution_{}/{}_meta.h5".format(pid, pid)
    tmscore_file = "data/puzzle_solutions/solution_{}/{}_tmscore.csv".format(pid, pid)
    soln_csv_file = "data/puzzle_solutions/solution_{}/{}_soln.csv".format(pid, pid)
    hist_csv_file = "data/puzzle_solutions/solution_{}/{}_hist.csv".format(pid, pid)
    if not os.path.exists(metafile) or overwrite:
        # tmscore_lookup = {}
        # if os.path.exists(tmscore_file):
        #     with open(tmscore_file) as fp:
        #         print(pid, "loading tmscores")
        #         tmscore_in = csv.DictReader(fp, fieldnames=['sid_a', 'sid_b', 'tmscore'])
        #         tmscore_lookup = {(r['sid_a'], r['sid_b']): float(r['tmscore']) for r in tmscore_in}
        # def get_tmscore(key):
        #     return tmscore_lookup.get(key, np.nan)
        soln_lookup = {}
        nid_to_sid = {}
        history = {}
        # with open("data/puzzle_solutions/solution_{}/{}_soln.pickle".format(pid, pid), 'rb') as fp:
        #     solns_clean = pickle.load(fp)
        #     soln_lookup = {get_nid(s): s for s in solns_clean}
        #     solvers = {k: sorted(g, key=lambda x: int(x['timestamp'])) for k, g in
        #                groupby(sorted(solns_clean, key=lambda s: s['uid']), lambda s: s['uid'])}
        # with open("data/puzzle_solutions/solution_{}/{}_hist.pickle".format(pid, pid), 'rb') as fp:
        #     history = pickle.load(fp)
        if not os.path.exists(soln_csv_file):
            print(pid, "fetching soln csv")
            sys.stdout.flush()
            if not os.path.exists("data/puzzle_solutions/solution_{}".format(pid)):
                os.makedirs("data/puzzle_solutions/solution_{}".format(pid))
            subprocess.run(["scp", "wannacut:~/foldit/{}".format(soln_csv_file), soln_csv_file],
                           stdout=subprocess.DEVNULL)
        with open("data/puzzle_solutions/solution_{}/{}_soln.csv".format(pid, pid)) as fp:
            print(pid, "processing", soln_csv_file)
            sys.stdout.flush()
            soln_in = csv.DictReader(fp, lineterminator='\n')
            for r in soln_in:
                r['pdl'] = json.loads(r['pdl'])
                r['guide_used'] = False
                for p in r['pdl']:
                    try:
                        # some pdl entries have a different header structure and were parsed incorrectly
                        p['header']['score'] = float(p['header']['score'])
                    except ValueError:
                        continue
                    if p['header']['score'] == 9999.99:
                        r['guide_used'] = True
                r['energy'] = float(r['energy'])
                r['timestamp'] = int(r['timestamp'])
                r['atoms'] = get_atoms(r)
                r.pop('ca')
                r['energies'] = [EnergyComponent(*e)
                                 for e in json.loads(r['energies'])] if r['energies'] else None
                if len(r['pdl']) > 0 and (r['uuid'], int(r['count'])) != ROOT_NID:
                    # and not all(sum(p['actions'].values()) == 0 for p in r['pdl']):
                    soln_lookup.setdefault((r['uuid'], int(r['count'])), []).append(r)
        solns_pre = []
        for nid, ss in soln_lookup.items():
            s = min(ss, key=lambda x: x['energy'])
            soln_lookup[nid] = s
            nid_to_sid[nid] = s['sid']
            if len(s['pdl']) > 0:
                solns_pre.append(s)
        protein_size, _ = Counter([len(s['atoms']) for s in solns_pre]).most_common(1)[0]
        solns_clean = [s for s in solns_pre if len(s['atoms']) == protein_size]
        solvers = {
            k: sorted(g, key=lambda x: int(x['timestamp']))
            for k, g in groupby(sorted(solns_clean, key=lambda s: s['uid']), lambda s: s['uid'])
        }
        if not os.path.exists(hist_csv_file):
            print(pid, "fetching hist csv")
            sys.stdout.flush()
            subprocess.run(["scp", "wannacut:~/foldit/{}".format(hist_csv_file), hist_csv_file],
                           stdout=subprocess.DEVNULL)
        with open("data/puzzle_solutions/solution_{}/{}_hist.csv".format(pid, pid)) as fp:
            print(pid, "processing", hist_csv_file)
            hist_in = csv.DictReader(fp, fieldnames=["pid", "uuid", "count", "parent_uuid", "parent_count"])
            for r in hist_in:
                key = (r['parent_uuid'], int(r['parent_count']))
                r['count'] = int(r['count'])
                r['parent_count'] = int(r['parent_count'])
                history.setdefault(key, []).append(r)
        parents = {}
        children = get_children(ROOT_NID, history)
        children = [(ROOT_NID, c) for c in children]
        while len(children) > 0:
            for p, c in children:
                assert c not in parents
                parents[c] = p
            children = [(c, nc) for p, c in children for nc in get_children(c, history)]
        logging.debug("{} generating lookups".format(pid))
        parent_lookup = {}
        for k in soln_lookup:
            parent = parents[k]
            while parent not in soln_lookup and parent != ROOT_NID:
                parent = parents[parent]
            assert parent in soln_lookup or parent == ROOT_NID
            parent_lookup[k] = parent
        child_lookup = {
            parent: [c for p, c in g]
            for parent, g in groupby(sorted([(p, c) for c, p in parent_lookup.items()]), lambda x: x[0])
        }
        descendants_memo = {}

        def get_descendants(nid):
            if nid in descendants_memo:
                return descendants_memo[nid]
            # soln_lookup is generated from the list of solutions passed in, which are all from a
            # single user; the history may include evolver children, which we have to avoid
            # trying to look up
            children = [
                c for c in child_lookup[nid]
                if c in soln_lookup or any(x in soln_lookup for x in get_descendants(c))
            ] if nid in child_lookup else []
            descendants_memo[nid] = children + [d for c in children for d in get_descendants(c)]
            return descendants_memo[nid]

        logging.debug("{} correcting timestamps".format(pid))
        bases = [get_nid(s) for s in soln_lookup.values() if parent_lookup[get_nid(s)] == ROOT_NID]
        while len(bases) > 0:
            nid = bases.pop(0)
            if nid in child_lookup:
                if nid in soln_lookup:
                    cur = soln_lookup[nid]
                    descendants = [soln_lookup[x] for x in get_descendants(nid) if x in soln_lookup]
                    if cur['timestamp'] > min(c['timestamp'] for c in descendants):
                        grandparent = {'timestamp': 0}
                        if parent_lookup[nid] in soln_lookup:
                            grandparent = soln_lookup[parent_lookup[nid]]
                        assert grandparent['timestamp'] <= min(c['timestamp'] for c in descendants)
                        cur['timestamp'] = max(min(c['timestamp'] for c in descendants) - 300,
                                               grandparent['timestamp'] + 1)
                bases.extend([c for c in child_lookup[nid]])
        delta = 3600
        print(pid, "computing soln metrics")
        sys.stdout.flush()
        param_ranges = {
            "energy_threshold_frac": [0.25, 0.5, 0.75],
            "rate_threshold": [-0.001, -0.01],
            "diff_threshold": [-1, -10, -25],
            "tm_threshold": [0.5, 0.9, 1],
        }
        breakthrough_params = [
            dict(d) for d in product(*[[(k, v) for v in vs] for k, vs in param_ranges.items()])
        ]
        logging.debug("{} passing parent_lookup, size {} and child_lookup, size {} to threads".format(
            pid, sys.getsizeof(parent_lookup), sys.getsizeof(child_lookup)))
        with Pool(snapshot_threads) as snapshot_pool:
            acc = snapshot_pool.map_async(
                partial(process_snapshots,
                        delta=delta,
                        breakthrough_params=breakthrough_params,
                        parent_lookup=parent_lookup,
                        child_lookup=child_lookup,
                        # get_tmscore_pkl=dill.dumps(get_tmscore),
                        nid_to_sid=nid_to_sid),
                sorted(solvers.values(), key=len, reverse=True),
                chunksize=1).get()
        df = pd.DataFrame(data=[d for d, _ in acc if d is not None])
        # breakthroughs = pd.concat([b for _, b in acc if b is not None])
        breakthroughs = pd.DataFrame()
        print(pid, 'metrics computed')
        sys.stdout.flush()
        best = df[df.frontier_pdbs.notnull()].frontier_pdbs.apply(lambda x: x[-1])
        # logging.debug("{} puzzle frontier tmscores".format(pid))
        # atoms_lookup = {s['sid']: s['atoms'] for s in best}
        # best_pairs = list(combinations([s['sid'] for s in best], 2))
        # for k, v in tmscore([c for c in best_pairs if c not in tmscore_lookup],
        #                     "tmp_data/{}_best".format(pid), atoms_lookup):
        #     tmscore_lookup[k] = v
        # best_tmscores = {c: tmscore_lookup[c] if c in tmscore_lookup else np.nan for c in best_pairs}
        en_lookup = {}
        for _, z in df.apply(lambda r: zip(r['timestamps'], r['energies']), axis=1).iteritems():
            for t, e in z:
                if t not in en_lookup:
                    en_lookup[t] = []
                en_lookup[t].append(e)
        pfront = np.minimum.accumulate([min(es) for t, es in sorted(en_lookup.items())])
        upload_baseline = max(
            stats.mode(np.concatenate(df.upload_rate[df.upload_rate.notnull()].values)).mode)
        df = df.assign(upload_ratio=df.upload_rate / upload_baseline)
        # it appears there's a clustering of energies for solutions that have only one or two
        # actions (usually repack), so we'll use that as the energy baseline
        energy_baseline = scipy.stats.mode(
            df[df.first_pdb.notnull()
               & df.first_pdb.apply(lambda p: p and sum(p['pdl'][0]['actions'].values()) < 3)]
            .first_pdb.apply(lambda p: round(p['energy']))).mode.min()
        print(pid, "getting structure")
        struct_file = "data/puzzle_solutions/solution_{}/{:010}.ir_puzzle.pdb".format(pid, int(pid))
        # setup_file = "data/puzzle_solutions/solution_{}/{:010}.ir_puzzle.puzzle_setup".format(pid, int(pid))
        if not os.path.exists(struct_file):
            subprocess.run(["scp", "wannacut:~/foldit/{}".format(struct_file), struct_file],
                           stdout=subprocess.DEVNULL)
            # subprocess.run(["scp", "wannacut:~/foldit/{}".format(setup_file), setup_file],
            #                stdout=subprocess.DEVNULL)
        with open(struct_file) as init_pdb:
            content = init_pdb.read()
            sec_struct = {
                i: l
                for i, l in [x.split()[:2]
                             for x in re.findall(r'^(?!ATOM)\s+?\d+.*', content, re.MULTILINE)]
            }
            assert all(v in ['H', 'E', 'L', 'C'] for v in sec_struct.values())
            atoms = Counter([x.split()[5] for x in re.findall('^ATOM.*', content, re.MULTILINE)])
            structure = {
                'loop': [atoms[i] for i, l in sec_struct.items() if l == 'C' or l == 'L'],
                'helix': [atoms[i] for i, l in sec_struct.items() if l == 'H'],
                'sheet': [atoms[i] for i, l in sec_struct.items() if l == 'E'],
            }
        # meta = PuzzleMeta(pid, best_tmscores, pfront, upload_baseline, energy_baseline, structure)
        meta = PuzzleMeta(pid, None, pfront, upload_baseline, energy_baseline, structure)
        print(pid, 'puzzle metrics computed')
        sys.stdout.flush()
        print(pid, "writing soln output")
        sys.stdout.flush()
        if os.path.exists(metafile) and overwrite:
            logging.debug("{} deleting existing meta file".format(pid))
            subprocess.run(['rm', metafile])  # remove to avoid ever accumulating data files
        store = pd.HDFStore(metafile)
        store["df"] = df
        store["bts"] = breakthroughs
        store["puz"] = pd.Series([meta])  # must be wrapped in a pandas data structure
        store.close()
        subprocess.run(["rm", soln_csv_file])
        print(pid, "done")
    else:
        print(metafile, "exists, will not overwrite")
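# Example invocation (hypothetical puzzle id): fetches <pid>_soln.csv and
# <pid>_hist.csv over scp if they are missing, computes per-solver metrics in
# parallel, and writes data/puzzle_solutions/solution_<pid>/<pid>_meta.h5.
#
#     process_puzzle_meta('2003433', overwrite=False, snapshot_threads=15)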
def munch(ifiles, ofiles):
    """
    Process all files in ifiles list. Output into ofiles list.
    """
    for ifname, ofname in zip(ifiles, ofiles):
        tree = ET.parse(ifname)
        root = tree.getroot()
        for sentence in root[-1].findall('S'):
            # step 0: detect and re-annotate 'не'
            for token in sentence.findall('W'):
                if token.attrib['LEMMA'] == 'не' and 'VERB' in token.attrib['FEAT']:
                    link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(token, sentence)
                    children = get_children(sentence, token.attrib['ID'])
                    if token.text != 'FANTOM' and all(ch.text != 'FANTOM' for ch in children):
                        for elem in children:
                            if 'VerbForm=Inf' in elem.attrib['FEAT']:
                                gr_children = get_children(sentence, elem.attrib['ID'])
                                break
                        for item in gr_children:
                            if item.attrib['LEMMA'] in suspicious:
                                gr_gr_children = get_children(sentence, item.attrib['ID'])
                                if all(gr_gr.attrib['FEAT'].split()[0] != 'ADP' for gr_gr in gr_gr_children):
                                    token.attrib['LEMMA'] = suspicious[item.attrib['LEMMA']]
                                    token.text = token.text + item.text
                                    token.attrib['FEAT'] = item.attrib['FEAT']
                                    item.attrib['DEL'] = 'YES'
                                    break
                    elif token.text != 'FANTOM' and any(ch.text == 'FANTOM' for ch in children):
                        for elem in children:
                            if 'VerbForm=Inf' in elem.attrib['FEAT']:
                                gr_children = get_children(sentence, elem.attrib['ID'])
                                break
                        for item in gr_children:
                            if item.attrib['LEMMA'] in suspicious:
                                gr_gr_children = get_children(sentence, item.attrib['ID'])
                                if all(gr_gr.attrib['FEAT'].split()[0] != 'ADP' for gr_gr in gr_gr_children):
                                    token.attrib['LEMMA'] = suspicious[item.attrib['LEMMA']]
                                    token.attrib['FEAT'] = item.attrib['FEAT']
                                    item.attrib['DEL'] = 'YES'
                                    break
                    elif token.text == 'FANTOM' and children == []:
                        if sentence.attrib['ID'] == '217':
                            for elem in sentence.findall('W'):
                                if elem.attrib['ID'] == '11':
                                    elem.attrib['DEL'] = 'YES'
                                if elem.attrib['ID'] == '12':
                                    elem.attrib['LEMMA'] = 'нечего'
                        if sentence.attrib['ID'] == '94':
                            for elem in sentence.findall('W'):
                                if elem.attrib['ID'] == '11':
                                    elem.attrib['DEL'] = 'YES'
                        if sentence.attrib['ID'] == '169':
                            for elem in sentence.findall('W'):
                                if elem.attrib['ID'] == '6':
                                    elem.attrib['DOM'] = '14'
                                if elem.attrib['ID'] == '9':
                                    elem.attrib['DEL'] = 'YES'
                                if elem.attrib['ID'] == '10':
                                    elem.attrib['LEMMA'] = 'некого'
                                if elem.attrib['ID'] == '11':
                                    elem.attrib['DOM'] = '13'
                                if elem.attrib['ID'] == '12':
                                    elem.attrib['DEL'] = 'YES'
                                if elem.attrib['ID'] == '13':
                                    elem.attrib['LEMMA'] = 'негде'
                                    elem.attrib['DOM'] = '10'
                    elif token.text == 'FANTOM' and any(ch.text == 'FANTOM' for ch in children):
                        for elem in sentence.findall('W'):
                            if elem.attrib['ID'] == '11':
                                elem.attrib['DEL'] = 'YES'
                            if elem.attrib['ID'] == '2':
                                elem.attrib['LEMMA'] = suspicious[elem.attrib['LEMMA']]
                                elem.attrib['DOM'] = '_root'
                                del elem.attrib['LINK']
                            if elem.attrib['DOM'] == '1':
                                elem.attrib['DOM'] = '2'
                    elif token.text == 'FANTOM' and all(ch.text != 'FANTOM' for ch in children):
                        if all('VerbForm=Inf' not in ch.attrib['FEAT'] for ch in children):
                            if sentence.attrib['ID'] == '440':
                                for elem in sentence.findall('W'):
                                    if elem.attrib['ID'] == '16':
                                        elem.attrib['DOM'] = '18'
                                    if elem.attrib['ID'] == '17':
                                        elem.attrib['DEL'] = 'YES'
                                    if elem.attrib['ID'] == '18':
                                        elem.attrib['LEMMA'] = suspicious[elem.attrib['LEMMA']]
                        for elem in children:
                            if 'VerbForm=Inf' in elem.attrib['FEAT']:
                                gr_children = get_children(sentence, elem.attrib['ID'])
                                if head_token is None:
                                    for item in gr_children:
                                        if item.attrib['LEMMA'] in suspicious:
                                            gr_gr_children = get_children(sentence, item.attrib['ID'])
                                            if all(gr_gr.attrib['FEAT'].split()[0] != 'ADP'
                                                   for gr_gr in gr_gr_children):
                                                item.attrib['LEMMA'] = suspicious[item.attrib['LEMMA']]
                                                item.attrib['DOM'] = '_root'
                                                del item.attrib['LINK']
                                                token.attrib['DEL'] = 'YES'
                                                for renum in sentence.findall('W'):
                                                    if renum.attrib['DOM'] == token.attrib['ID']:
                                                        renum.attrib['DOM'] = item.attrib['ID']
                                                break
                                    else:
                                        for broken in children:
                                            if broken.attrib['LEMMA'] in suspicious:
                                                broken.attrib['LEMMA'] = suspicious[broken.attrib['LEMMA']]
                                                broken.attrib['DOM'] = '_root'
                                                del broken.attrib['LINK']
                                                token.attrib['DEL'] = 'YES'
                                                for renum in sentence.findall('W'):
                                                    if renum.attrib['DOM'] == token.attrib['ID']:
                                                        renum.attrib['DOM'] = broken.attrib['ID']
                                else:
                                    for item in gr_children:
                                        if item.attrib['LEMMA'] in suspicious:
                                            gr_gr_children = get_children(sentence, item.attrib['ID'])
                                            if all(gr_gr.attrib['FEAT'].split()[0] != 'ADP'
                                                   for gr_gr in gr_gr_children):
                                                token.attrib['LEMMA'] = suspicious[item.attrib['LEMMA']]
                                                token.attrib['FEAT'] = item.attrib['FEAT']
                                                token.text = item.text
                                                item.attrib['DEL'] = 'YES'
                                                for renum in sentence.findall('W'):
                                                    if renum.attrib['DOM'] == item.attrib['ID']:
                                                        renum.attrib['DOM'] = token.attrib['ID']
                    else:
                        pass
        for sentence in root[-1].findall('S'):
            # step 2: collect token numbers old:new
            numbering = {}
            token_number = 0
            for token in sentence.findall('W'):
                if 'DEL' not in token.attrib:
                    token_number += 1
                numbering[token.attrib['ID']] = str(token_number)
            # step 3: assign new numbers
            for word in sentence.findall('W'):
                word.attrib['ID'] = numbering[word.attrib['ID']]
                if word.attrib['DOM'] != '_root':
                    word.attrib['DOM'] = numbering[word.attrib['DOM']]
            # step 4: remove tokens
            for elem in sentence.findall('W'):
                if 'DEL' in elem.attrib:
                    sentence.remove(elem)
        for sentence in root[-1].findall('S'):
            # Mood=Cnd fix
            for token in sentence.findall('W'):
                if token.attrib['LEMMA'] in {'бы', 'б', 'чтобы', 'чтоб'}:
                    link, pos, feats, head_token, head_pos, head_feats, head_root = get_info(token, sentence)
                    try:
                        if head_token.attrib['LEMMA'] not in forbidden_head:
                            if pos in {'SCONJ', 'PART'}:
                                token.attrib['FEAT'] = token.attrib['FEAT'] + ' Mood=Cnd'
                            else:
                                token.attrib['FEAT'] = token.attrib['FEAT'].replace(' Foreign=Yes', '')
                    except Exception:
                        print('Something went wrong')
                        print(*[(elem.text, elem.tail.rstrip('\n'), elem.attrib) for elem in sentence],
                              sep='\n')
                        print()
        tree.write(ofname, encoding="UTF-8")
    return