def test_align():
    cand = ["U.S", ".", "policy"]
    gold = ["U.S.", "policy"]
    assert align(cand, gold) == [0, None, 1]
    cand = ["your", "stuff"]
    gold = ["you", "r", "stuff"]
    assert align(cand, gold) == [None, 2]
    cand = [u'i', u'like', u'2', u'guys', u' ', u'well', u'id', u'just',
            u'come', u'straight', u'out']
    gold = [u'i', u'like', u'2', u'guys', u'well', u'i', u'd', u'just',
            u'come', u'straight', u'out']
    assert align(cand, gold) == [0, 1, 2, 3, None, 4, None, 7, 8, 9, 10]
def align_tokens_and_ud(token_score_tuples, ud_output):
    result = []
    score_tuples_a = []
    ud_out_b = []
    for tuple in token_score_tuples:
        score_tuples_a.append(tuple[0].lower())
    for row in ud_output:
        ud_out_b.append(row[1].lower())
    alignment = align(score_tuples_a, ud_out_b)
    cost, a2b, b2a, a2b_multi, b2a_multi = alignment
    debug_print = False
    for tuple_index in range(len(token_score_tuples)):
        if a2b[tuple_index] != -1:
            result.append((token_score_tuples[tuple_index][0],
                           token_score_tuples[tuple_index][1],
                           ud_output[a2b[tuple_index]][3],
                           ud_output[a2b[tuple_index]][4]))
        else:
            debug_print = True
    # if len(a2b_multi) > 0:
    #     debug_print = True
    #     print('a2b_multi', a2b_multi)
    # if len(b2a_multi) > 0:
    #     debug_print = True
    #     print('b2a_multi', b2a_multi)
    # if debug_print:
    #     print('a', score_tuples_a)
    #     print('a2b', a2b)
    #     print('b', ud_out_b)
    #     print('b2a', b2a)
    return result
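# --- Usage sketch (not from the original source): a minimal, hypothetical call to
# align_tokens_and_ud above, assuming `align` here is spaCy 2.x's spacy.gold.align
# and that ud_output rows follow CoNLL-U column order (id, form, lemma, upos, xpos, ...).
# With identical tokenizations every a2b entry is a valid index, so each token/score
# pair is joined with its UPOS and XPOS columns.

token_score_tuples = [("The", 0.91), ("cat", 0.87), ("sat", 0.80)]
ud_output = [
    ["1", "The", "the", "DET", "DT"],
    ["2", "cat", "cat", "NOUN", "NN"],
    ["3", "sat", "sit", "VERB", "VBD"],
]
print(align_tokens_and_ud(token_score_tuples, ud_output))
# [('The', 0.91, 'DET', 'DT'), ('cat', 0.87, 'NOUN', 'NN'), ('sat', 0.80, 'VERB', 'VBD')]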
def test_line():
    line = 'The switches between clarity and intoxication gave me a headache, but at least the silver-haired faery’s explanation of the queens’ “gifts” helped me understand why I could want to wrap my legs around a creature who terrified me.'
    spacy_doc = nlp(line.lower())
    spacy_tokens = [str(token) for token in spacy_doc]
    spacy_tokens_pos = [token.pos_ for token in spacy_doc]
    bert_tokens = tokenizer.tokenize(line)
    diff = align(bert_tokens, spacy_tokens)[0]
    print('Spacy : {}'.format(spacy_tokens))
    print('BERT : {}'.format(bert_tokens))
def get_corr_ind(tok1: List[str], tok2: List[str],
                 tok1_idx: List[List[int]]) -> List[List[int]]:
    """
    Aligns two different tokenizations and outputs the tok2_idx.
    tok1: tokenized sents via method1
    tok2: tokenized sents via method2
    tok1_idx: indices of tok1
    output: tok2_idx: indices of tok2 based on tok1_idx
    """
    cost, a2b, b2a, a2b_multi, b2a_multi = align(tok1, tok2)
    # If aligned no pains
    # can directly return tok1_idx
    if cost == 0:
        return tok1_idx
    # Else create tok2_idx
    tok2_idx = []
    for t1_idx in tok1_idx:
        t2_idx = []
        for t in t1_idx:
            # If the tok1_idx corresponds
            # to one single token of tok2
            # just use that
            if a2b[t] != -1:
                t2_idx.append(a2b[t])
            # else use multi outputs
            else:
                # hacky implementation
                # Basically, if the previous word is aligned,
                # and the next word as well, assign current words
                # to the difference of the two
                if t != len(tok1) - 1:
                    if a2b[t - 1] != -1 and a2b[t + 1] != -1:
                        t2_idx.append(
                            [x for x in range(a2b[t - 1] + 1, a2b[t + 1])])
                    elif a2b[t - 1] != -1:
                        t2_idx.append(
                            [x for x in range(a2b[t - 1] + 1, len(tok2))])
                    else:
                        # Currently seems to work,
                        # set_trace to see when it doesn't work
                        import pdb
                        pdb.set_trace()
                        pass
        tok2_idx.append(t2_idx)
    return tok2_idx
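# --- Usage sketch (hypothetical, not from the original source): when the two
# tokenizations are identical, spaCy 2.x's align returns cost 0 and get_corr_ind
# simply echoes tok1_idx; only when cost > 0 does it fall back to a2b and the
# range heuristic above.

tok1 = ["the", "cat", "sat"]
tok2 = ["the", "cat", "sat"]
tok1_idx = [[0, 1], [2]]
print(get_corr_ind(tok1, tok2, tok1_idx))
# cost == 0, so the indices pass through unchanged: [[0, 1], [2]]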
def test_file(file_number):
    filename = 'books_wiki_en_corpus_' + 'training_' + str(file_number) + '.txt'
    file = os.path.join(base_dir, filename)
    total_diffs = 0
    with open(file) as f:
        text = f.readlines()
    for line in text:
        # spacy_doc = nlp(line.lower())
        # spacy_tokens = [str(token) for token in spacy_doc]
        bert_tokens = tokenizer_bert.tokenize(line)
        doc = Doc(nlp.vocab, words=bert_tokens)
        spacy_tokens = [t.text for t in doc]
        diff = align(bert_tokens, spacy_tokens)[0]
        if diff != 0:
            print('Difference of ' + str(diff) + ' positions at line : ' + line + '\n')
            total_diffs += 1
    print('Total number of files with differences : {}'.format(total_diffs))
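# --- Side note (a hedged sketch, not part of the original file): the
# Doc(nlp.vocab, words=...) construction above bypasses spaCy's tokenizer and keeps
# the WordPiece tokens verbatim, so aligning the two lists is an identity mapping
# with cost 0 by construction.

import spacy
from spacy.tokens import Doc

nlp_blank = spacy.blank("en")              # hypothetical standalone setup
wordpieces = ["un", "##believ", "##able"]  # example WordPiece-style tokens
doc = Doc(nlp_blank.vocab, words=wordpieces)
assert [t.text for t in doc] == wordpieces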
def test_align(tokens_a, tokens_b, expected):
    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)
    assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected
    # check symmetry
    cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)
    assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected
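# --- A minimal, hedged example of the expected 5-tuple, assuming `align` is spaCy
# 2.x's spacy.gold.align: identical tokenizations give zero cost, identity index
# arrays, and empty many-to-one dicts, which also satisfies the symmetry check above.

from spacy.gold import align  # spaCy 2.x; removed in v3 in favour of spacy.training.Alignment

cost, a2b, b2a, a2b_multi, b2a_multi = align(["a", "b"], ["a", "b"])
assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == (0, [0, 1], [0, 1], {}, {})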
def main(args):
    import csv
    with open(args.data_path) as f:
        data = json.load(f)
    judgement_diffs = []
    N = args.num_grams
    user_kept_pos_ngram_counter = collections.Counter()
    all_gen_pos_ngram_counter = collections.Counter()
    pos_textlist_dict = collections.defaultdict(list)
    gen_len = 0
    gen_ent_len = 0
    interest_len = 0
    interest_ent_len = 0
    num_user_interest = 0
    num_interested = 0
    num_corner_case = 0
    num_uniq_gen_entities = 0
    num_uniq_interest_entities = 0
    num_uniq_gen_predefined_entities = 0
    num_uniq_gen_predefined_entities_hard_match = 0
    num_uniq_interest_predefined_entities = 0

    ent_csvf = open(os.path.join(
        args.output_dir, f'Entity_extraction_{args.type_interested}.csv'), mode='w')
    ent_writer = csv.writer(ent_csvf, delimiter='\t')
    ent_writer.writerow([
        'Generated entities', 'Predefined characters',
        'Generated entities that are from the predefined character list',
        'Generated entities that are from the predefined character list by hard match'
    ])

    for data_idx, item in enumerate(data):
        # copy the existing content, a dict of 4 fields
        new_dict = item
        story = new_dict['story']
        predefined_characters = obtain_predefined_entities(story)
        generated = item['generated']['description']
        finalized = item['finalized']['description']
        gen_doc = nlp(generated)
        gen_spacy_tokens_unprep = [token.text for token in gen_doc]
        gen_spacy_tokens = process_spacy_tokens_for_align(
            gen_spacy_tokens_unprep)
        gen_prep = rouge._preprocess_summary_as_a_whole(generated)
        gen_prep_zero = gen_prep[0]
        gen_rouge_tokens = gen_prep_zero.split()
        cost, s2r, r2s, s2r_multi, r2s_multi = align(gen_spacy_tokens,
                                                     gen_rouge_tokens)
        new_dict['diffs'] = get_diff_score(generated, finalized)
        judgement_diffs.append(new_dict)

        gen_pos = [token.pos_ for token in gen_doc]
        all_gen_pos_ngram_counter.update(ngrams(gen_pos, N))
        ents = gen_doc.ents
        gen_ent_ids = []
        for ent in ents:
            ent_ids = list(range(ent.start, ent.end))
            gen_ent_ids.extend(ent_ids)
        gen_len += len(gen_pos)
        gen_ent_len += len(gen_ent_ids)

        gen_ent_str_list = []
        gen_ent_str_predefined_list = []
        gen_ent_str_predefined_list_hard_match = []
        for ent in ents:
            # number of uniq entities
            ent_lc = ent.text.lower()
            gen_ent_str_list.append(ent_lc)
            if ent_lc in predefined_characters:
                gen_ent_str_predefined_list_hard_match.append(ent_lc)
            if contained_in(ent_lc, predefined_characters):
                gen_ent_str_predefined_list.append(ent_lc)
        print('Generated entities')
        print(' | '.join(gen_ent_str_list))
        print('Predefined characters: ')
        print(' | '.join(predefined_characters))
        print('Generated entities that are kept in the predefined character list: ')
        print(' | '.join(gen_ent_str_predefined_list))
        print('\n\n')
        ent_writer.writerow([
            ' | '.join(gen_ent_str_list),
            ' | '.join(predefined_characters),
            ' | '.join(gen_ent_str_predefined_list),
            ' | '.join(gen_ent_str_predefined_list_hard_match)
        ])

        gen_ent_str_int_set = set()
        gen_ent_str_int_prefefined_set = set()
        for diff_results in new_dict['diffs']:
            if diff_results['type'] == args.type_interested:
                num_user_interest += 1
                [start_r, end_r] = diff_results['span_hypothesis']
                content = diff_results['content']
                content_rouge = gen_rouge_tokens[start_r:end_r]
                if end_r == len(r2s):
                    # this is a corner case because end_r is not actually
                    # the ending index of a span but that + 1
                    start_s, end_s = r2s[start_r], r2s[end_r - 1] + 1  # bug causing
                else:
                    start_s, end_s = r2s[start_r], r2s[end_r]  # bug causing
                content_spacy = gen_spacy_tokens[start_s:end_s]
                if content_spacy == []:
                    # print('handle corner case')
                    # print(content)
                    # print(content_rouge)
                    start_s = end_s - 1
                    content_spacy = gen_spacy_tokens[start_s:end_s]
                    # print(content_spacy)
                    # print('\n\n')
                    # num_corner_case += 1
                # print(' | '.join(content_spacy))
                cur_ngrams_kept = list(ngrams(
                    content_spacy, N))  # a list of n grams of actual text
                pos_tokens = gen_pos[start_s:end_s]
                cur_ngrams_pos = list(ngrams(pos_tokens, N))  # a list of n grams of pos
                user_kept_pos_ngram_counter.update(
                    cur_ngrams_pos)  # update the count of POS ngrams
                assert len(content_spacy) == len(pos_tokens)
                assert len(cur_ngrams_kept) == len(cur_ngrams_pos)
                # update the dictionary (str, list): (pos_ngram, list of actual ngrams as examples)
                for pos_ngram, text_ngram in zip(cur_ngrams_pos, cur_ngrams_kept):
                    pos_ngram_str = ' '.join(pos_ngram)
                    text_ngram_str = ' '.join(text_ngram)
                    pos_textlist_dict[pos_ngram_str].append(text_ngram_str)

                interest_ids = list(range(start_s, end_s))
                entity_interest_ids = [
                    id for id in interest_ids if id in gen_ent_ids
                ]
                interest_ent_len += len(entity_interest_ids)
                interest_len += end_s - start_s

                # check whether the generated entities appear in diff
                for entity in gen_ent_str_list:
                    if entity in ' '.join(content_spacy):
                        gen_ent_str_int_set.add(entity)
                        # check whether an entity in this type of text is present
                        # in the set of predefined characters
                        if contained_in(entity, predefined_characters):
                            gen_ent_str_int_prefefined_set.add(entity)
                            print(entity)
                        # else:
                        #     # print(entity)

        num_uniq_gen_entities += len(gen_ent_str_list)
        num_uniq_interest_entities += len(gen_ent_str_int_set)
        num_uniq_gen_predefined_entities += len(gen_ent_str_predefined_list)
        num_uniq_gen_predefined_entities_hard_match += len(
            gen_ent_str_predefined_list_hard_match)
        num_uniq_interest_predefined_entities += len(
            gen_ent_str_int_prefefined_set)
        # print(gen_ent_str_list)
        # print(gen_ent_str_int_set)
        # print()

    ent_csvf.close()
    print(f'analyzed {data_idx + 1} user judgements')
    print(f'Out of {num_interested} interested diff')
    print(f'there are {num_corner_case} corner cases')

    # convert a counter to a list of tuples, changing its keys from tuple to str
    user_kept_pos_ngram_counter_strkey = collections.Counter(
        dict([(' '.join(k), v) for k, v in user_kept_pos_ngram_counter.items()]))
    all_gen_pos_ngram_counter_strkey = collections.Counter(
        dict([(' '.join(k), v) for k, v in all_gen_pos_ngram_counter.items()]))

    # traverse over the pos ngrams the user kept, compute their ratio
    print('traverse through the pos ngrams user kept, compute their ratio')
    kept_gen_pos_ngram_ratio_dict = {}
    for k, v in user_kept_pos_ngram_counter_strkey.most_common(50):
        kept_gen_pos_ngram_ratio_dict[k] = user_kept_pos_ngram_counter_strkey[
            k] / all_gen_pos_ngram_counter_strkey[k]
        # if all_gen_pos_ngram_counter_strkey[k] == 0:
        #     print('error')
    items = kept_gen_pos_ngram_ratio_dict.items()
    sorted_items = sorted(items, key=lambda key_value: key_value[1], reverse=True)

    import csv
    # write the pos ngrams analysis results into google spread sheet
    with open(os.path.join(args.output_dir,
                           f'{N}_grams_pos_{args.type_interested}.csv'),
              mode='w') as csvf:
        fieldnames = [
            'pos pattern', 'ratio (NOK / NOG)',
            'Number of Occurrences in user-Kept text (NOK)',
            'Number of Occurrences in Generated text (NOG)',
            'Examples in user-kept text'
        ]
        writer = csv.writer(csvf, delimiter='\t')
        writer.writerow(fieldnames)
        for idx in range(min([50, len(sorted_items)])):
            pos, ratio = sorted_items[idx]
            ck = user_kept_pos_ngram_counter_strkey[pos]
            cg = all_gen_pos_ngram_counter_strkey[pos]
            writer.writerow([
                pos, ratio, ck, cg,
                ' || '.join(random.sample(pos_textlist_dict[pos], 5))
            ])
    print('Saved: ')
    print(os.path.join(args.output_dir,
                       f'{N}_grams_pos_{args.type_interested}.csv'))

    print(f'gen len: {gen_len}')
    print(f'gen ent len: {gen_ent_len}')
    print(f'{args.type_interested} len: {interest_len}')
    print(f'{args.type_interested} ent len: {interest_ent_len}')
    print(f'in generated text the percentage of entity is {gen_ent_len / gen_len}')
    print(f'in {args.type_interested} text the percentage of entity is '
          f'{interest_ent_len / interest_len}')
    print(f'number of user {args.type_interested} edits: {num_user_interest}')
    print(f'average user {args.type_interested} edit len: '
          f'{interest_len / num_user_interest}')
    print(f'Number of entities generated: {num_uniq_gen_entities}')
    print(f'Number of entities generated that are from predefined character list: '
          f'{num_uniq_gen_predefined_entities}')
    print(f'Number of entities generated that are from predefined character '
          f'(by hard match) list: {num_uniq_gen_predefined_entities_hard_match}')
    print(f'Number of entities in {args.type_interested}: {num_uniq_interest_entities}')
    print(f'Number of entities in {args.type_interested} that are from predefined '
          f'character list: {num_uniq_interest_predefined_entities}')
    print('End of main')
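# --- A self-contained sketch (hypothetical data) of the POS n-gram bookkeeping in
# main() above, assuming `ngrams` is nltk.util.ngrams (the script may define its own
# helper): count POS n-grams over all generated text and over a user-kept span, then
# report the kept/generated ratio per pattern.

import collections
from nltk.util import ngrams

N = 2
gen_pos = ["DET", "ADJ", "NOUN", "VERB", "DET", "NOUN"]  # POS of the whole generation
kept_pos = ["DET", "NOUN"]                               # POS of one user-kept span

all_gen_counter = collections.Counter(ngrams(gen_pos, N))
kept_counter = collections.Counter(ngrams(kept_pos, N))

for pos_ngram, kept_count in kept_counter.items():
    print(" ".join(pos_ngram), kept_count / all_gen_counter[pos_ngram])
# DET NOUN 0.5   (kept once, generated twice)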
def parse(self, document_name: str,
          sentences: Iterable[Sentence]) -> Iterator[Sentence]:
    """Parse visual information embedded in sentence's html_attrs.

    :param document_name: the document name.
    :param sentences: sentences to be linked with visual information.
    :return: A generator of ``Sentence``.
    """

    def attrib_parse(
        html_attrs: List[str],
    ) -> Dict[str, Union[List[int], List[str]]]:
        ret: Dict[str, Union[List[int], List[str]]] = {}

        for attr in html_attrs:
            key, values = attr.split("=", 1)  # split only at the first occurrence
            if key in ["left", "top", "right", "bottom", "ppageno"]:
                ret[key] = [int(x) for x in values.split()]
            elif key == "tokens":
                # Run RegEx replacements
                for (rgx, replace) in self.replacements:
                    values = rgx.sub(replace, values)
                ret[key] = values.split()
        return ret

    for _, group in itertools.groupby(sentences, key=lambda x: x.xpath):
        sents = list(group)
        # Get bbox from document
        attribs = attrib_parse(sents[0].html_attrs)
        lefts = attribs["left"]
        tops = attribs["top"]
        rights = attribs["right"]
        bottoms = attribs["bottom"]
        ppagenos = attribs["ppageno"]

        # Clear the hocr specific html_attrs
        for sent in sents:
            for attr in sent.html_attrs[:]:
                key, values = attr.split("=", 1)  # split only at the first occurrence
                if key in [
                    "left",
                    "top",
                    "right",
                    "bottom",
                    "ppageno",
                    "tokens",
                    "x_wconf",
                ]:
                    sent.html_attrs.remove(attr)

        # Get a list of all tokens represented by ocrx_word in hOCR
        hocr_tokens = attribs["tokens"]

        # Get a list of all tokens tokenized by spaCy.
        spacy_tokens = [word for sent in sents for word in sent.words]

        # gold.align assumes that both tokenizations add up to the same string.
        cost, h2s, s2h, h2s_multi, s2h_multi = align(hocr_tokens, spacy_tokens)

        ptr = 0  # word pointer
        for sent in sents:
            sent.left = []
            sent.top = []
            sent.right = []
            sent.bottom = []
            sent.page = []
            for i, word in enumerate(sent.words):
                # One-to-one mapping is NOT available
                if s2h[ptr + i] == -1:
                    if ptr + i in s2h_multi:
                        # One spacy token-to-multi hOCR words
                        left = lefts[s2h_multi[ptr + i]]
                        top = tops[s2h_multi[ptr + i]]
                        right = rights[s2h_multi[ptr + i]]
                        bottom = bottoms[s2h_multi[ptr + i]]
                        ppageno = ppagenos[s2h_multi[ptr + i]]
                    else:
                        h2s_multi_idx = [
                            k for k, v in h2s_multi.items() if ptr + i == v
                        ]
                        start, end = 0, 0
                        if h2s_multi_idx:
                            # One hOCR word-to-multi spacy tokens
                            start = h2s_multi_idx[0]
                            end = h2s_multi_idx[-1] + 1
                        else:
                            start = s2h_multi[i - 1 if i > 0 else 0]
                            end = s2h_multi[i + 1] + 1
                        # calculate a bbox that can include all
                        left = min(lefts[start:end])
                        top = min(tops[start:end])
                        right = max(rights[start:end])
                        bottom = max(bottoms[start:end])
                        ppageno = ppagenos[start]
                # One-to-one mapping is available
                else:
                    left = lefts[s2h[ptr + i]]
                    top = tops[s2h[ptr + i]]
                    right = rights[s2h[ptr + i]]
                    bottom = bottoms[s2h[ptr + i]]
                    ppageno = ppagenos[s2h[ptr + i]]
                sent.left.append(left)
                sent.top.append(top)
                sent.right.append(right)
                sent.bottom.append(bottom)
                sent.page.append(ppageno + 1)  # 1-based in Fonduer
            ptr += len(sent.words)
            yield sent
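# --- A tiny illustration (hypothetical numbers) of the bounding-box merge used in
# parse() above when one hOCR word maps to several spaCy tokens, or vice versa:
# the merged bbox takes the extremes over the aligned slice.

lefts, tops, rights, bottoms = [10, 42], [5, 6], [40, 80], [20, 22]
start, end = 0, 2
merged = (min(lefts[start:end]), min(tops[start:end]),
          max(rights[start:end]), max(bottoms[start:end]))
print(merged)  # (10, 5, 80, 22)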
def link_spans():
    """Match spans with verbs and assign data.

    The primary challenge is that there is no easy way to link the
    Spacy-parsed doc data with the word data we have for GBI. Remember
    that the full verse texts are compiled from the word lists and then
    parsed as a full verse. The challenge is to use the indexing from
    before the word lists were compiled to match with the indexing of
    the Spacy doc object. That then needs to be cross-referenced with
    the Spacy Matcher object.
    """
    ts.indent(0, reset=True)
    ts.info('matching spans...')
    inspect = InspectionDoc()
    bhsa2eng = collections.defaultdict(dict)
    for trans, bhsa_nodes in english_verbs.items():
        for bhsa_node, para_words in bhsa_nodes.items():

            # get GBI-side data
            verse_ref = id2ref(para_words[0], 'translation')
            para_text = ' '.join(word_data[trans][w]['text'] for w in para_words)
            verse_words = verse2words[trans][verse_ref]
            verse_tokens = [word_data[trans][w]['text'] for w in verse_words]
            verse_tokens = [t.replace(';', '.') for t in verse_tokens]

            # get Spacy-side data
            verse_parsing = parsed_verses[trans][verse_ref]
            spacy_tokens = [str(t) for t in verse_parsing]

            # map Spacy tokens back to GBI tokens using indices
            # Spacy tokenizes words with apostrophes differently (e.g. `he'll` == `he` + `'ll`)
            # They can be re-aligned: https://spacy.io/usage/linguistic-features#aligning-tokenization
            cost, a2b, b2a, a2b_multi, b2a_multi = align(spacy_tokens, verse_tokens)

            # alignment of indices here
            aligner = lambda i: a2b_multi.get(i, a2b[i])  # returns 1-to-1 or many-to-1 aligned index

            # try to retrieve span links with advanced tense tags
            verse_sents = list(verse_parsing.sents)
            spans = verse2spans[trans].get(verse_ref, [])
            # search for overlapping GBI id sets
            span_match = trans_to_span(para_words, spans, verse_words, aligner) or ''
            if span_match:
                tense_tag = span_match._.tense_tag
                sentence_i = verse_sents.index(span_match[-1].sent)
            else:
                tense_tag = ''
                sentence_i = None

            # retrieve basic parsings
            raw_tokens = []
            for i, token in enumerate(verse_parsing):
                if verse_words[aligner(i)] in para_words:
                    raw_tokens.append(token)
            vb_tokens = [t for t in raw_tokens if t.tag_.startswith('VB')]

            # save the data
            data = {
                'eng_ref': verse_ref,
                'words': para_text,
                'tags': '|'.join(t.tag_ for t in raw_tokens),
                'vb_tags': '|'.join(t.tag_ for t in vb_tokens),
                'tense': tense_tag,
                'tense_span': f'{span_match}',
                'sentence_i': sentence_i,
            }
            bhsa2eng[trans][bhsa_node] = data

            # add strings to inspection file
            if span_match and span_match._.tense_tag:
                inspect.data[trans][verse_ref] += f'\t\tMATCH: {bhsa_node}|{tense_tag}|{span_match}|{para_text}\n'
            else:
                # NB: originally built from adjacent string literals, which left
                # {para_text} uninterpolated; a single f-string keeps the intent
                inspect.data[trans][verse_ref] += f'\t\tMISS: {bhsa_node}|||{para_text}\n'

    ts.info('done with matches')
    return (bhsa2eng, inspect)
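# --- A hedged sketch of the `aligner` fallback above, assuming spaCy 2.x's
# spacy.gold.align: for contractions such as "he'll", both spaCy pieces map to the
# single GBI token through a2b_multi, while ordinary tokens come back through a2b.

from spacy.gold import align

spacy_tokens = ["he", "'ll", "go"]
verse_tokens = ["he'll", "go"]
cost, a2b, b2a, a2b_multi, b2a_multi = align(spacy_tokens, verse_tokens)

aligner = lambda i: a2b_multi.get(i, a2b[i])
print([aligner(i) for i in range(len(spacy_tokens))])  # expected: [0, 0, 1]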
def __init__(self, gold, pred, verbose=False, group=False):
    """
    Align golden and predicted tokens, and their tags.
    Create dictionaries of falsely predicted tags.

    :param gold: the gold conllu file
    :param pred: the predicted conllu file
    :param verbose: if true, print information about token numbers
    :param group: if true, put falsely predicted ufeats labels into a dictionary
        that contains all the labels it was falsely assigned and the number of
        times each predicted label was found
    """
    gold = C.load_conll(open(gold, 'r', encoding='utf8'))
    gold_dic = C.convert_conll(gold)  # returns a dictionary with all the column names
    gold_doc = Document(gold_dic)

    pred = C.load_conll(open(pred, 'r', encoding='utf8'))
    pred_dic = C.convert_conll(pred)  # returns a dictionary with all the column names
    pred_doc = Document(pred_dic)

    # get the tokens
    self.gold_tokens = [j['text'] for i in gold_dic for j in i]
    self.pred_tokens = [j['text'] for i in pred_dic for j in i]

    # get upos tags
    gold_tags = [j['upos'] for i in gold_dic for j in i]
    pred_tags = [j['upos'] for i in pred_dic for j in i]

    # get xpos tags
    gold_xpos = [j['xpos'] for i in gold_dic for j in i]
    pred_xpos = [j['xpos'] for i in pred_dic for j in i]

    # get ufeats tags
    gold_feats = list()
    pred_feats = list()
    for i in gold_dic:
        for j in i:
            if 'feats' in j:
                gold_feats.append(j['feats'])
            else:
                gold_feats.append('_')
    for i in pred_dic:
        for j in i:
            if 'feats' in j:
                pred_feats.append(j['feats'])
            else:
                pred_feats.append('_')

    if verbose:
        print('Number of gold tokens:', len(self.gold_tokens),
              ', number of predicted tokens:', len(self.pred_tokens))

    # align gold and predicted tokens
    cost, a2b, b2a, a2b_multi, b2a_multi = align(self.gold_tokens, self.pred_tokens)

    # align tokens and their POS tags separately
    self.aligned = list()       # tokens
    self.aligned_pos = list()   # upos
    self.aligned_feats = list()
    self.aligned_xpos = list()
    for i in range(len(b2a)):
        t = (self.gold_tokens[b2a[i]], self.pred_tokens[i])
        self.aligned.append(t)
        p = (gold_tags[b2a[i]], pred_tags[i])
        self.aligned_pos.append(p)
        f = (gold_feats[b2a[i]], pred_feats[i])
        self.aligned_feats.append(f)
        x = (gold_xpos[b2a[i]], pred_xpos[i])
        self.aligned_xpos.append(x)

    # align predicted tags to golden tags, not vice versa as before
    gold_aligned = list()
    for i in range(len(a2b)):
        t = (self.gold_tokens[i], self.pred_tokens[a2b[i]])
        gold_aligned.append(t)

    overall = list()
    for (a, b) in self.aligned:
        if a == b:
            overall.append((a, b))
    if verbose:
        print('Aligned tokens. GOLD:', len(gold_aligned),
              'PREDICTED:', len(self.aligned), 'ALIGNED:', len(overall))

    self.conf_tags = {}      # falsely predicted upos tags
    self.conf_tags_all = {}  # all upos tags
    self.incorrect_upos = 0  # number of incorrectly predicted upos tags

    # how many times different tags co-occurred in gold and pred files
    i = 0
    for (a, b) in self.aligned_pos:
        if a != b:
            self.incorrect_upos += 1
            if (a, b) not in self.conf_tags:
                self.conf_tags[(a, b)] = 1
            else:
                self.conf_tags[(a, b)] += 1
        if (a, b) not in self.conf_tags_all:
            self.conf_tags_all[(a, b)] = 1
        else:
            self.conf_tags_all[(a, b)] += 1
        i += 1

    self.conf_feats = {}
    self.conf_feats_all = {}
    self.incorrect_feats = 0
    i = 0
    for (a, b) in self.aligned_feats:
        a = "|".join(sorted(feat for feat in a.split("|")
                            if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
        b = "|".join(sorted(feat for feat in b.split("|")
                            if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
        if a != b:
            self.incorrect_feats += 1
            # create a dictionary for each falsely predicted ufeats label
            # and group all its false predictions
            if group:
                if a not in self.conf_feats:
                    self.conf_feats[a] = dict()
                    self.conf_feats[a][b] = 1
                else:
                    if b not in self.conf_feats[a]:
                        self.conf_feats[a][b] = 1
                    else:
                        self.conf_feats[a][b] += 1
            else:
                if (a, b) not in self.conf_feats:
                    self.conf_feats[(a, b)] = 1
                else:
                    self.conf_feats[(a, b)] += 1
        if (a, b) not in self.conf_feats_all:
            self.conf_feats_all[(a, b)] = 1
        else:
            self.conf_feats_all[(a, b)] += 1
        i += 1

    self.conf_xpos = {}
    self.incorrect_xpos = 0
    i = 0
    for (a, b) in self.aligned_xpos:
        if a != b:
            self.incorrect_xpos += 1
            if (a, b) not in self.conf_xpos:
                self.conf_xpos[(a, b)] = 1
            else:
                self.conf_xpos[(a, b)] += 1
        i += 1
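# --- A small, hedged illustration of the ufeats normalization in the loop above:
# keep only features whose names are in UNIVERSAL_FEATURES (a hypothetical subset is
# shown here), then sort so gold/pred strings compare order-independently.

UNIVERSAL_FEATURES = {"Case", "Gender", "Number", "Tense"}  # illustrative subset

def normalize_feats(feats: str) -> str:
    return "|".join(sorted(feat for feat in feats.split("|")
                           if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))

print(normalize_feats("Number=Sing|Case=Nom|Style=Coll"))
# 'Case=Nom|Number=Sing'  -- the non-universal Style feature is dropped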