def afterTurbo(data, allorthree):
    sents = [sent for sent in parse_incr(data)]
    predicted = open("tmp/predicted.conll", 'r')
    predsents = [sent for sent in parse_incr(predicted)]
    sentidx3nouns = [i for i in range(len(sents)) if has3Nouns(sents[i])]
    if allorthree:
        sents = [sents[i] for i in sentidx3nouns]
        predsents = [predsents[i] for i in sentidx3nouns]
    if len(sents) != len(predsents):
        print("number of sentences is different from predicted sentences")
    toteq = 0
    totneq = 0
    for (tlist1, tlist2) in zip(sents, predsents):
        eq, neq = compareTlist(tlist1, tlist2)
        toteq += eq
        totneq += neq
    with open("tmp/testresults.txt", 'w') as file:
        file.write(str(toteq) + "\n")
        file.write(str(totneq))
    print(toteq / (totneq + toteq))

def get_word_parsing(dev_path=None, test_path=None, train_path=None):
    # Newer UD release, but the labels in fac follow the older version.
    if dev_path is None:
        dev_path = "../unified/uds/UD-EWT/en_ewt-ud-dev.conllu"
    if test_path is None:
        test_path = "../unified/uds/UD-EWT/en_ewt-ud-test.conllu"
    if train_path is None:
        train_path = "../unified/uds/UD-EWT/en_ewt-ud-train.conllu"

    print("-----------OpenUdConllu------------")
    dev = open(dev_path, "r", encoding="utf-8")
    test = open(test_path, "r", encoding="utf-8")
    train = open(train_path, "r", encoding="utf-8")

    dev_data = []
    test_data = []
    train_data = []

    print("-----------OpenUd---dev---------")
    for tokenlist in parse_incr(dev):
        dev_data.append(tokenlist)
    print("-----------OpenUd---test---------")
    for tokenlist in parse_incr(test):
        test_data.append(tokenlist)
    print("-----------OpenUd---train---------")
    for tokenlist in parse_incr(train):
        train_data.append(tokenlist)

    ud_counter = defaultdict(Counter)
    ud_counter['en-ud-dev.conllu'] = dev_data
    ud_counter['en-ud-test.conllu'] = test_data
    ud_counter['en-ud-train.conllu'] = train_data
    return ud_counter

def parse_wsd_data():
    # Parse the EUD-EWT CoNLL-U files and retrieve the sentences.
    # (Optional punctuation stripping on the lemmas is left out here.)
    train_file = open("data/UD_English-EWT/en_ewt-ud-train.conllu",
                      "r", encoding="utf-8")
    train_data = list(parse_incr(train_file))
    print('Parsed {} training data from UD_English-EWT/en_ewt-ud-train.conllu.'.format(
        len(train_data)))

    test_file = open("data/UD_English-EWT/en_ewt-ud-test.conllu",
                     "r", encoding="utf-8")
    test_data = list(parse_incr(test_file))
    print('Parsed {} testing data from UD_English-EWT/en_ewt-ud-test.conllu.'.format(
        len(test_data)))

    dev_file = open("data/UD_English-EWT/en_ewt-ud-dev.conllu",
                    "r", encoding="utf-8")
    dev_data = list(parse_incr(dev_file))
    print('Parsed {} dev data from UD_English-EWT/en_ewt-ud-dev.conllu.'.format(
        len(dev_data)))

    # Parse the WSD dataset: read in the tsv by White et al., 2016.
    wsd_data = []
    with open('data/wsd/wsd_eng_ud1.2_10262016.tsv', mode='r') as wsd_file:
        tsv_reader = csv.DictReader(wsd_file, delimiter='\t')
        # Store each row (an ordered dict) as one data vector.
        for row in tsv_reader:
            wsd_data.append(row)

    # Make sure all data are parsed.
    print('Parsed {} word sense data from White et al., 2016.'.format(
        len(wsd_data)))

    return wsd_data, train_data, test_data, dev_data

def test_accuracy(test_file, probabilities):
    seen = 0
    correct = 0

    print("> Starting tests")
    with open(test_file, "r", encoding="utf-8") as tf:
        for sentence in parse_incr(tf):
            previous = None
            for word in sentence:
                if previous is not None:
                    previous_pos = previous["upos"]
                    pos = word["upos"]
                    possibilities = ensure(probabilities, previous_pos)
                    most_likely = possibilities[0][0]
                    if pos == most_likely:
                        correct += 1
                    seen += 1
                previous = word
    print("< Finishing tests")
    print()
    print(f"Total: {seen} words")
    print(f"Correct: {correct} PoS")
    print(f"Accuracy: {correct * 100 / seen} %")

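# A minimal sketch (not from the original source) of the `ensure` helper that
# test_accuracy above assumes: given bigram probabilities, it returns candidate
# next tags sorted by probability, so possibilities[0][0] is the most likely tag.
# The "NOUN" fallback and the probability structure are assumptions.
def ensure(probabilities, previous_pos):
    candidates = probabilities.get(previous_pos, {"NOUN": 1.0})
    return sorted(candidates.items(), key=lambda kv: kv[1], reverse=True)

# Hypothetical call, assuming probabilities shaped like
# {"DET": {"NOUN": 0.6, "ADJ": 0.3, ...}, ...}:
# test_accuracy("en_ewt-ud-test.conllu", bigram_probabilities)
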
def non_projectivity_rate(train):
    sequences = []
    for tokenlist in parse_incr(train):
        sequences.append(tokenlist)

    non_proj = 0
    taille = 0
    for i in range(len(sequences)):
        sentence = sequences[i]
        taille += len(sequences[i])
        mot1 = []
        mot2 = []
        for j in range(len(sequences[i])):
            token = sentence[j]
            if token['head'] is not None:
                mot1.append(token['id'])
                mot2.append(token['head'])
        for k in range(len(mot1)):
            for l in range(1, len(mot1)):
                if mot2[l] > mot1[k] and mot2[l] < mot2[k]:
                    non_proj += 1
                    break
    return float(non_proj / taille)

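# Usage sketch (the path below is hypothetical): non_projectivity_rate expects
# an already opened CoNLL-U file, since parse_incr consumes a stream.
with open("fr_gsd-ud-train.conllu", "r", encoding="utf-8") as train_file:
    print("non-projectivity rate:", non_projectivity_rate(train_file))
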
def main():
    docids = []
    file = open("yourfile.conllu", 'r', encoding="utf-8")
    for tokenlist in parse_incr(file):
        docids.append(tokenlist.metadata['sent_id'])

    trainfile = open("train.txt")
    sentences = open("sentences.txt", "w")
    num = 0
    for line in trainfile:
        line = line.split("\t")
        if line[0] not in docids:
            num = num + 1
            # Remove annotations from the data.
            sent = line[2].replace(']', '')
            sent = sent.strip()
            sent = sent.split(" ")
            new_sent = []
            for item in sent:
                if not item.startswith("["):
                    new_sent.append(item)
            cleansent = " ".join(new_sent)
            sentences.write(cleansent)
            sentences.write('\n')
    print(num)

def get_sentences(stressed=False):
    for fp in get_dataset_connlu_files():
        first_tokenlist = True
        for tokenlist in parse_incr(fp):
            first_token = True
            sentence = ''
            for token in tokenlist:
                if not first_token or not first_tokenlist and tokenlist.metadata['newpar id']:
                    sentence += '\n'
                form = (token['misc']['StressedForm']
                        if stressed and 'misc' in token and token['misc']
                        and 'StressedForm' in token['misc']
                        else token['form'])
                if form == '<g/>':
                    continue
                sentence += form
                try:
                    no_space_after = token['misc']['SpaceAfter'] == 'No'
                except (TypeError, KeyError):
                    no_space_after = False
                if not no_space_after:
                    sentence += ' '
            yield sentence
            first_token = False
            first_tokenlist = False

def split_to_conllu_corpus(input_file, output_dir_path, num_of_sent=1000):
    """
    Split a corpus into smaller CoNLL-U files.
    :param input_file: opened input .conllu file
    :param output_dir_path: directory for the output chunks
    :param num_of_sent: number of sentences per output file
    :return:
    """
    def open_output_file(_output_dir_path, _output_file_name, _file_index):
        return open(_output_dir_path + '\\' + _output_file_name + '_' +
                    str(_file_index).zfill(3) + '.conllu',
                    'w', encoding='utf-8-sig')

    file_name = os.path.splitext(input_file.name)
    file_name = os.path.split(file_name[0])
    output_file_name = file_name[1]

    file_index = 0
    _num_of_sent = 0
    output_file = open_output_file(output_dir_path, output_file_name, file_index)
    for tokenlist in parse_incr(input_file):
        if _num_of_sent >= num_of_sent:
            file_index += 1
            _num_of_sent = 0
            output_file = open_output_file(output_dir_path, output_file_name, file_index)
        _num_of_sent += 1
        print(tokenlist.serialize().strip(), file=output_file)
        print(file=output_file)

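# Usage sketch (the paths are hypothetical, not from the original project):
# split_to_conllu_corpus takes an opened file object because parse_incr reads
# from a stream, and it writes numbered <name>_NNN.conllu chunks into the
# output directory (joined with a Windows-style backslash above).
with open("corpus/ko_corpus.conllu", "r", encoding="utf-8-sig") as corpus_file:
    split_to_conllu_corpus(corpus_file, "corpus\\split", num_of_sent=500)
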
def load():
    import conllu
    data_file = open("cs-ud-train-l.conllu", "r", encoding="utf-8")
    # parse_incr is recommended for large files (more than 1 MB) since it
    # returns a generator, which is why we convert it to a list here.
    tokenlist = list(conllu.parse_incr(data_file))

    for sentence in tokenlist:
        for token in sentence:
            form_lemma = token['form'] + token['lemma']

    for sentence in tokenlist:
        chain = []
        for token in sentence:
            if token['head']:
                parent = sentence[token['head'] - 1]
                if token['deprel'] == "case" and parent['deprel'] == "nmod":
                    chain.append(token)

    for sentence in tokenlist:
        for token in sentence:
            token['deprel'] = 'dep'

    with open('out.conllu', 'w', encoding='utf8') as f:
        f.writelines([sentence.serialize() + "\n" for sentence in tokenlist])

def tokenize(self, file_name):
    all_ud_tokens = []
    all_ud_data = []
    count_del, count_total = 0, 0

    # Initialise all the trees and embeddings
    with open(file_name, "r", encoding="utf-8") as file:
        for token_list in parse_incr(file):
            ud_tokens = []
            ud_data = []
            for item in token_list:
                ud_tokens.append(item['form'])
                ud_data.append({
                    'word': item['form'],
                    'pos': item['upostag'],
                    'head': item['head'],
                    'rel': item['deprel'],
                })
            # If there are more than max_tokens tokens, skip the sentence.
            if len(ud_tokens) <= self.max_tokens:
                all_ud_tokens.append(ud_tokens)
                all_ud_data.append(ud_data)
            else:
                count_del += 1
            count_total += 1

    if count_del > 0:
        print('\n\n\tWarning! Removed %d (of %d) long sentences\n\n'
              % (count_del, count_total))
    return all_ud_tokens, all_ud_data

def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as conllu_file:
        logger.info("Reading UD instances from conllu dataset at: %s", file_path)

        for annotation in parse_incr(conllu_file):
            # CoNLLU annotations sometimes add back in words that have been elided
            # in the original sentence; we remove these, as we're just predicting
            # dependencies for the original sentence.
            # We filter by integers here as elided words have a non-integer word id,
            # as parsed by the conllu python library.
            annotation = [x for x in annotation if isinstance(x["id"], int)]

            heads = [x["head"] for x in annotation]
            tags = [x["deprel"] for x in annotation]
            words = [x["form"] for x in annotation]
            if self.use_language_specific_pos:
                pos_tags = [x["xpostag"] for x in annotation]
            else:
                pos_tags = [x["upostag"] for x in annotation]
            yield self.text_to_instance(words, pos_tags, list(zip(tags, heads)))

def download_embeddings(tmp_download_path, embeddings_save_path, dataset_file_paths):
    download_model('el', tmp_download_path, if_exists='ignore')
    ft = fasttext.load_model(f'{tmp_download_path}/cc.el.300.bin')

    if not dataset_file_paths:
        dataset_file_paths = [
            f'data/ud/{ds}.conllu' for ds in ('train', 'dev', 'test')
        ]

    # Collect the lowercased vocabulary from all dataset splits.
    vocab = set()
    for p in dataset_file_paths:
        with open(p) as fr:
            for e in parse_incr(fr):
                for t in e:
                    vocab.add(t['form'].lower())

    word_vectors = []
    i2w = list(vocab)
    for word in i2w:
        word_vectors.append(ft.get_word_vector(word))

    # Reserve index 0 for a zero-valued padding vector.
    word_vectors = [[0] * len(word_vectors[0])] + word_vectors
    i2w = ['<PAD>'] + i2w
    w2i = {w: i for i, w in enumerate(i2w)}

    with open(embeddings_save_path, 'wb') as fw:
        pickle.dump((np.array(word_vectors), w2i, i2w), fw)

def write_file_for_main_POS():
    """
    To extract sentences where all POS of a certain type are grouped.
    :return:
    """
    tags_of_interest = [
        'ADJ', 'ADV', 'NOUN', 'PRON', 'VERB', 'AUX', 'DET', 'PROPN'
    ]
    out_file_path = os.path.basename(path_to_conllu_file)[:-7] + '_POS.csv'
    with open('data/' + out_file_path, 'w') as outfile:
        writer = csv.writer(outfile)
        writer.writerow([
            '{}|{} {}'.format('#' if i == 0 else '', i, t)
            for i, t in enumerate(tags_of_interest)
        ])
        for sentence in parse_incr(open(path_to_conllu_file, "r", encoding="utf-8")):
            token_forms = []
            for token in sentence:
                if token["upostag"] in tags_of_interest:
                    index = tags_of_interest.index(token["upostag"])
                    token_forms.append('|{} {} |'.format(
                        index, token["form"].replace('|', '')))
                else:
                    token_forms.append(token["form"])
            writer.writerow([' '.join(token_forms)])

def load_check_data(check_file):
    with open(check_file, 'r', encoding='utf-8') as cf:
        conllulist = []
        for tokenlist in cl.parse_incr(cf):
            conllulist.append(tokenlist)
    sentlist = [[token['form'] for token in sent] for sent in conllulist]
    return conllulist, sentlist

def parse_and_extract(conllu_path) -> List[Tuple[Inp, Out]]:
    """Parse a CoNLL-U file and return the list of input/output pairs."""
    data = []
    with open(conllu_path, "r", encoding="utf-8") as data_file:
        for token_list in conllu.parse_incr(data_file):  # type: ignore
            data.append(extract(token_list))
    return data

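# Minimal sketch of the pieces parse_and_extract relies on; these definitions
# are assumptions for illustration, not the original project's code. Here Inp
# is a list of word forms, Out the corresponding UPOS tags, and extract pulls
# both out of a conllu TokenList.
from typing import List, Tuple

Inp = List[str]
Out = List[str]

def extract(token_list) -> Tuple[Inp, Out]:
    # Keep only regular tokens (integer ids), skipping multiword-token ranges.
    tokens = [tok for tok in token_list if isinstance(tok["id"], int)]
    return [tok["form"] for tok in tokens], [tok["upos"] for tok in tokens]
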
def extract_sentences(path, filenames, split='train'):
    """Extract word-tag lists of sentences from downloaded conllu files.

    Args:
        path (str): The path to the stored data.
        filenames (str): The name of the files for the current treebank.
        split (optional, str): The split to be returned: `train`, `test` or `dev`.

    Returns:
        (list): A list of length equal to the number of sentences, where each
            element is a dict containing two lists: one with the words and one
            with the corresponding tags.
    """
    path = os.path.join(path, '%s%s.conllu' % (filenames, split))
    data_file = open(path, "r", encoding="utf-8")

    sentences = []
    for tokenlist in parse_incr(data_file):
        words = []
        tags = []
        for t in list(tokenlist):
            words.append(t['form'])
            tags.append(t['upos'])
        sentences.append({'words': words, 'udtag': tags})
    data_file.close()
    return sentences

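# Usage sketch (treebank prefix and path are hypothetical): with the naming
# convention above, the file is expected at <path>/<filenames><split>.conllu,
# e.g. data/ud/en_ewt-ud-train.conllu.
train_sents = extract_sentences("data/ud", "en_ewt-ud-", split="train")
print(train_sents[0]["words"][:5], train_sents[0]["udtag"][:5])
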
def extact(test_file):
    all_sent = []
    data_test = open(test_file, "r", encoding="utf-8")
    for tokenlist in parse_incr(data_test):
        # Index 1 is typically the `# text = ...` metadata line (after `# sent_id`).
        all_sent.append(tuple(dict(tokenlist.metadata).values())[1])
    print('Test sentences are extracted')
    return all_sent

def get_umbc_dict():
    dict_filen = os.path.join(proj_dir, 'umbc_freq.pkl')
    if os.path.exists(dict_filen):
        return pickle.load(open(dict_filen, mode='rb'))

    umbc_dir = '/mnt/store/home/makrai/data/language/english/corp/umbc_WebBase/English/'
    freq = defaultdict(int)
    for filen in glob.glob(os.path.join(umbc_dir, 'en-common_crawl-*.conllu.xz')):
        logging.info(filen)
        for i, sentence in enumerate(
                parse_incr(lzma.open(os.path.join(umbc_dir, filen),
                                     mode='rt', encoding="utf-8"))):
            if not i % 100000:
                logging.debug(i)
            root = sentence.to_tree()
            subj, obj = '', ''
            for child in root.children:
                if 'subj' in child.token['deprel']:
                    if subj:
                        # More than one subject: keep the first one.
                        continue
                    subj = child.token['lemma']
                elif child.token['deprel'] == 'obj':
                    if obj:
                        # More than one object: keep the first one.
                        continue
                    obj = child.token['lemma']
            freq[(subj, root.token['lemma'], obj)] += 1
    pickle.dump(freq, open(dict_filen, mode='wb'))
    return freq

def import_data(filepath):
    '''
    Imports the data from the specific .conllu file supplied.

    Parameters:
        filepath (str): Filepath to conllu file

    Returns:
        sentences (list<np.array>): A list of sentences (arrays of word forms)
        sentence_tags (list<np.array>): A list of the corresponding UPOS tags
    '''
    data_file = open(filepath, mode="r", encoding="utf8")
    tokenlist = list(parse_incr(data_file))

    tagged_sentences = []
    for sentence in tokenlist:
        tmp = []
        for token in sentence:
            tmp.append((token["form"], token["upos"]))
        tagged_sentences.append(tmp)

    sentences, sentence_tags = [], []
    for tagged_sentence in tagged_sentences:
        sentence, tags = zip(*tagged_sentence)
        sentences.append(np.array(sentence))
        sentence_tags.append(np.array(tags))
    return sentences, sentence_tags

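# Usage sketch (hypothetical path): each returned element is a NumPy array of
# word forms aligned with an array of UPOS tags for the same sentence.
sentences, sentence_tags = import_data("data/en_ewt-ud-dev.conllu")
for word, tag in zip(sentences[0], sentence_tags[0]):
    print(word, tag)
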
def read_conllu_outputs(reference_path, hypothesis_path):
    translations = []
    with open(reference_path, 'r', encoding="utf-8") as ref, \
         open(hypothesis_path, 'r', encoding="utf-8") as hyp:
        for ref_tokenlist, hyp_tokenlist in zip(parse_incr(ref), parse_incr(hyp)):
            ref_sentence = Sentence()
            hyp_sentence = Sentence()
            for token in ref_tokenlist:
                ref_sentence.words.append(Word(token["form"], token["lemma"]))
            for token in hyp_tokenlist:
                hyp_sentence.words.append(Word(token["form"], token["lemma"]))
            translations.append(Translation(ref_sentence, hyp_sentence))
    return translations

def raw_text(input_file: str, output_file: str = None, gpu: bool = False,
             time: bool = True, memory: bool = True, ner: bool = True,
             model_name: str = "hu_core_news_lg"):
    nlp = load_pipeline(gpu, ner, model_name)
    if output_file:
        nlp.add_pipe("conll_formatter")

    data_file = open(input_file, "r", encoding="utf-8")
    sentences = list(parse_incr(data_file))
    texts = " ".join([s.metadata["text"] for s in sentences])

    if time:
        with Timer() as t:
            res = nlp(texts)
        print(f'Time spent: {t.elapsed:.2f} seconds')
    else:
        res = nlp(texts)

    if output_file:
        with open(output_file, 'w', encoding='utf-8') as writer:
            # noinspection PyProtectedMember
            print(rename_root(res._.conll_str), sep="\n", file=writer)

    if memory:
        print(f'Maximum memory usage: '
              f'{resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MiB')

def write_file_for_open_vs_closed_POS():
    """
    To extract sentences where open/closed class POS tags are grouped.
    :return:
    """
    out_file_path = os.path.basename(path_to_conllu_file)[:-7] + '_open-closed.csv'
    with open('data/' + out_file_path, 'w') as outfile:
        writer = csv.writer(outfile)
        writer.writerow([
            '{}|{} {}'.format('#' if i == 0 else '', i, t)
            for i, t in enumerate(['open', 'closed'])
        ])
        for sentence in parse_incr(open(path_to_conllu_file, "r", encoding="utf-8")):
            token_forms = []
            for token in sentence:
                if token["upostag"] in CONLLU_tags.open_class_tags:
                    token_forms.append('|{} {} |'.format(
                        0, token["form"].replace('|', '')))
                elif token["upostag"] in CONLLU_tags.closed_class_tags:
                    token_forms.append('|{} {} |'.format(
                        1, token["form"].replace('|', '')))
                else:
                    token_forms.append(token["form"])
            writer.writerow([' '.join(token_forms)])

def write_file_for_nominal_core_args():
    """
    Extract just the 'nominal core args', i.e., subjects and objects.
    :return:
    """
    out_file_path = os.path.basename(path_to_conllu_file)[:-7] + '_nom_args.csv'
    with open('data/' + out_file_path, 'w') as outfile:
        writer = csv.writer(outfile)
        writer.writerow([
            '{}|{} {}'.format('#' if i == 0 else '', i, t)
            for i, t in enumerate(CONLLU_tags.nominal_core_arguments)
        ])
        for sentence in parse_incr(open(path_to_conllu_file, "r", encoding="utf-8")):
            token_forms = []
            for token in sentence:
                if token["deprel"] in CONLLU_tags.nominal_core_arguments:
                    index = CONLLU_tags.nominal_core_arguments.index(token["deprel"])
                    token_forms.append('|{} {} |'.format(
                        index, token["form"].replace('|', '')))
                else:
                    token_forms.append(token["form"])
            writer.writerow([' '.join(token_forms)])

def conllu2arr(direc):
    # Draws conllu files from direc and sorts them into the respective arrays.
    for root, dirs, files in os.walk(direc):
        for file in files:
            if ".conllu" in file:
                doc = []
                conllufile = file
                filepath = os.path.join(root, conllufile)
                datafile = open(filepath, "r", encoding="utf-8")
                for tokenlist in conllu.parse_incr(datafile):
                    for token in tokenlist:
                        if token["form"] not in punct:
                            doc.append(token["form"])
                if "kongzi" in file:
                    kongzi.append(doc)
                elif "mengzi" in file:
                    mengzi.append(doc)
                elif "liuxiang" in file:
                    liuxiang.append(doc)
                elif "dongzhongshu" in file:
                    dongzhongshu.append(doc)
                elif "zhuangzi" in file and "outer" not in file:
                    zhuangzi.append(doc)
                elif "outer" in file:
                    zhuangzi_test.append(doc)

def parse_corpus(filename):
    data_file = open(filename, encoding="utf-8")
    ud_parses = list(parse_incr(data_file))
    return ud_parses

def pearson_baseline(path):
    sentences = []
    for s in parse_incr(open(path, "r", encoding="utf-8")):
        sentences.append(s)

    baseline_left_score = []
    baseline_right_score = []
    gold_score = []
    for i, s in enumerate(sentences):
        arcs = tree_utils.conllu_to_arcs(s.to_tree())
        nodes = list(set([a[j] for j in [0, 1] for a in arcs]))
        nodes.sort()

        baseline_left = [(nodes[i], nodes[i - 1]) for i in range(1, len(nodes))]
        baseline_right = [(nodes[i - 1], nodes[i]) for i in range(1, len(nodes))]

        baseline_left_matrix = -tree_utils.arcs_to_distance_matrix(baseline_left)
        baseline_right_matrix = -tree_utils.arcs_to_distance_matrix(baseline_right)
        baseline_left_matrix_bidir = -tree_utils.arcs_to_distance_matrix(
            baseline_left, bidirectional=True)
        baseline_right_matrix_bidir = -tree_utils.arcs_to_distance_matrix(
            baseline_right, bidirectional=True)
        gold_matrix = -tree_utils.arcs_to_distance_matrix(arcs)
        gold_matrix_bidir = -tree_utils.arcs_to_distance_matrix(
            arcs, bidirectional=True)

        pearson_left = tree_utils.pearson_scores(baseline_left_matrix, s)
        pearson_left_bidir = tree_utils.pearson_scores(baseline_left_matrix_bidir, s)
        pearson_right = tree_utils.pearson_scores(baseline_right_matrix, s)
        pearson_right_bidir = tree_utils.pearson_scores(baseline_right_matrix_bidir, s)
        pearson_gold = tree_utils.pearson_scores(gold_matrix, s)
        pearson_gold_bidir = tree_utils.pearson_scores(gold_matrix_bidir, s)

        baseline_left_score.append(pearson_left + pearson_left_bidir)
        baseline_right_score.append(pearson_right + pearson_right_bidir)
        gold_score.append(pearson_gold + pearson_gold_bidir)

    print("PEARSON BASELINES: (plain; irreflexive; bidirectional dep; bidir-irrefl) "
          "(same for bidirectional both sides)")
    for label, scores in zip(["LEFT", "RIGHT", "GOLD"],
                             [baseline_left_score, baseline_right_score, gold_score]):
        print(" " + label)
        means = [np.nanmean(score) for score in zip(*scores)]
        print(' ' + '\n '.join([
            '{} ({})'.format(a.round(2), b.round(2))
            for a, b in zip(means[::2], means[1::2])
        ]))

def load_conllu(filepath):
    sentences = []
    print("loading sentences from {}".format(filepath))
    with open(filepath) as f:
        for s in tqdm(conllu.parse_incr(f)):
            if len(s) > 1:
                sentences.append(s)
    return sentences

def _read(self, file_path: str) -> Iterator[Instance]:
    """
    Creates an iterator over instances from a file path
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for token_list in conllu.parse_incr(f):
            sentence = [token['form'] for token in token_list]
            pos_tags = [token['upostag'] for token in token_list]
            yield self.text_to_instance([Token(word) for word in sentence], pos_tags)

def write_cleaned_version(output_filename, input_parsed_file, whitelist_sent_id):
    outfile = open(output_filename, "w+", encoding="utf-8")
    with open(input_parsed_file, "r", encoding="utf-8") as infile:
        for sentence in conllu.parse_incr(infile):
            if sentence.metadata["sent_id"] in whitelist_sent_id:
                outfile.writelines(sentence.serialize())
    outfile.close()

def parse_corpus(filename: str) -> List[TokenList]:
    """
    Parses a file into a collection of TokenLists
    """
    data_file = open(filename, encoding="utf-8")
    ud_parses = list(parse_incr(data_file))
    return ud_parses