def read_conll(treebank_path, langs, code_to_lang, train_or_dev, tgt_size=None, test=False):
    """
    Reads a CoNLL-U formatted file.
    langs: list of languages
    train_or_dev: which split to read ("train" or "dev"; "test" when test=True)
    returns: dict with data for each language as a list of (sentence, morph-tags) tuples,
             plus the list of unique tag sets seen
    """
    annot_sents = {}
    unique = []
    for lang in langs:
        train = train_or_dev if not test else "test"
        if not test:
            filepath = treebank_path + "UD_" + code_to_lang[lang] + '/' + lang + "-ud-" + train + ".conllu"
        else:
            filepath = treebank_path + "UD_" + code_to_lang[lang] + '/' + lang + "-ud-test.conllu"

        with open(filepath) as f:
            data = f.readlines()[:-1]
            data = [line for line in data if line[0] != '#']
            split_data = " ".join(data).split("\n \n")
            ud = [parse(sent)[0] for sent in split_data]

        all_text = []
        all_tags = []

        # optionally cap the target language (last in langs) at tgt_size sentences
        if langs[-1] == lang and tgt_size:
            tgt_size = min(tgt_size, len(ud))
            ud = ud[:tgt_size]

        for sent in ud:
            sent_text = []
            sent_tags = []
            for word in sent:
                # collect morphological features plus the POS tag for each word
                word_tags = {}
                if word['feats']:
                    word_tags = dict(word['feats'])
                if word['upostag']:
                    if word_tags:
                        word_tags.update({'POS': word['upostag']})
                    else:
                        word_tags = {'POS': word['upostag']}
                if word_tags:
                    word_tags = freeze_dict(word_tags)
                    if word_tags not in unique:
                        unique.append(word_tags)
                sent_text.append(word['form'])
                sent_tags.append(freeze_dict(word_tags))
            all_text.append(sent_text)
            all_tags.append(sent_tags)

        annot_sents[lang] = [(w, m) for w, m in zip(all_text, all_tags)]

    return annot_sents, unique
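A minimal usage sketch for the `read_conll` above. The `freeze_dict` helper is not shown in the snippet, so the one-liner here is an assumption (a dict frozen into a hashable frozenset), and the treebank path and language codes are placeholders, not the original author's values.

# Assumed stand-in for the freeze_dict helper the snippet calls;
# the real project may implement it differently.
def freeze_dict(d):
    return frozenset(d.items())

# Illustrative call; the path and language codes are placeholders.
annot_sents, unique_tagsets = read_conll(
    treebank_path="ud-treebanks-v2.0/",
    langs=["da", "sv"],
    code_to_lang={"da": "Danish", "sv": "Swedish"},
    train_or_dev="train",
    tgt_size=1000,
)
words, tags = annot_sents["sv"][0]  # first Swedish sentence and its tag sets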
def parse(self, data):
    sentences_raw = UD_PARSER.parse(data)
    sentences_parsed = conllu_parser.parse(sentences_raw)
    sentences = self.to_json(sentences_parsed)
    return OrderedDict([
        ("sentences", sentences),
    ])
def load_conllu(self, filename):
    '''
    Loads and parses a file in the CoNLL-U format.
    Files with fewer than 10 columns were parsed with this parser:
    https://github.com/svetlana21/conllu
    '''
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.read()
    result = parse(data)
    return result
def parse_sent(inf, outf, return_tree=True):
    # read configs and command line options
    config = configparser.ConfigParser()
    config.read('config.ini')
    in_fname, out_fname = inf, outf
    check_infile(in_fname)
    fname_clean = os.path.basename(in_fname).rsplit('.', 1)[0]

    # temporary files and folder
    tmp_path = get_path_from_config(config, 'TMP_PATH', 'tmp')
    tmp_fsuffixes = [
        '_mystem_in.txt', '_mystem_out.txt',
        '_treetagger_in.txt', '_treetagger_out.txt',
        '_raw.conll'
    ]
    # note: assumes tmp_path has exactly five slash-separated components
    a, b, c, d, e = (PurePosixPath(j) for j in tmp_path.split('/'))
    tmp_fnames = [
        str(a / b / c / d / e / (fname_clean + fsuffix))
        for fsuffix in tmp_fsuffixes
    ]

    # output file and folder
    out_path = get_path_from_config(config, 'OUT_PATH', 'out')
    a, b, c, d, e = (PurePosixPath(j) for j in out_path.split('/'))
    if out_fname is None:
        out_fname = str(a / b / c / d / e / (fname_clean + '.conll'))
    else:
        out_fname = str(a / b / c / d / e / out_fname)

    # create output and temp folders if needed
    for path in [tmp_path, out_path]:
        if not os.path.exists(path):
            os.makedirs(path)

    # rock'n'roll
    process(in_fname, out_fname,
            config['DEFAULT']['APP_ROOT'],
            config['mystem']['MYSTEM_PATH'],
            config['malt']['MALT_ROOT'],
            config['malt']['MALT_NAME'],
            config['malt']['MODEL_NAME'],
            config['dicts']['COMP_DICT_PATH'],
            config['treetagger']['TREETAGGER_BIN'],
            config['treetagger']['TREETAGGER_PAR'],
            *tmp_fnames)

    for fname in tmp_fnames:
        os.remove(fname)

    with open(out_fname, 'r', encoding='utf-8') as conll_file:
        conll_data = conll_file.read()
    os.remove(out_fname)

    if return_tree:
        return parse_tree(conll_data)
    return parse(conll_data)
def load_from_conll_u_file(self, file_path):
    tokens = []
    with open(file_path) as input_file:
        data = input_file.read()
    parsed_data = parse(data)
    total_counter = 0
    total_span_counter = 0
    for sent_counter, sentence in enumerate(parsed_data):
        for token_counter, token in enumerate(sentence):
            new_token = Token(total_counter, token['form'],
                              total_span_counter,
                              total_span_counter + len(token['form']))
            total_span_counter += len(token['form']) + 1
            if token_counter == 0:
                new_token.add_a_label('SentenceBegin', str(sent_counter))
            if token['lemma']:
                new_token.add_a_label('lemma', token.get('lemma'))
            if token['upostag']:
                new_token.add_a_label('upostag', token.get('upostag'))
            if token['xpostag']:
                new_token.add_a_label('xpostag', token.get('xpostag'))
            feats = token['feats']
            if feats:
                for feat in feats:
                    new_token.add_a_label('feat-' + feat, feats[feat])
            if token['head']:
                new_token.add_a_label('head', str(token.get('head')))
            if token['deprel']:
                new_token.add_a_label('deprel', str(token.get('deprel')))
            if token['deps']:
                new_token.add_a_label('deps', token.get('deps'))
            misc = token['misc']
            if misc:
                for feat in misc:
                    new_token.add_a_label('misc-' + feat, misc[feat])
            tokens.append(new_token)
            total_counter += 1
    return tokens
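The loader above depends on a `Token` class that is not part of the snippet. A minimal stand-in, reconstructed from the call sites (constructor takes an index, a surface form, and a character span; `add_a_label` attaches string labels), might look like this; the real class is likely richer.

class Token:
    # Hypothetical stand-in reconstructed from how load_from_conll_u_file
    # uses it; not the project's actual class.
    def __init__(self, index, form, span_start, span_end):
        self.index = index
        self.form = form
        self.span = (span_start, span_end)
        self.labels = {}

    def add_a_label(self, name, value):
        self.labels[name] = value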
def read_conll(langs, code_to_lang, train_or_dev, tgt_size=None, test=False):
    """
    Reads a CoNLL-U formatted file and dumps words and lemmas to text files.
    """
    treebank_path = "/projects/tir2/users/cmalaviy/ud_exp/ud-treebanks-v2.0/"
    test_treebank_path = "/projects/tir2/users/cmalaviy/ud_exp/ud-test-v2.0-conll2017/input/conll17-ud-test-2017-05-09/"
    # treebank_path = "/projects/tir2/users/cmalaviy/ud_exp/ud-treebanks-conll2017/"

    annot_sents = {}
    for lang in langs:
        sent_text = []
        lemmas = []
        train = train_or_dev if not test else "test"
        if not test:
            filepath = treebank_path + "UD_" + code_to_lang[lang] + '/' + lang + "-ud-" + train + ".conllu"
        else:
            filepath = test_treebank_path + lang + "-udpipe.conllu"

        with open(filepath) as f:
            data = f.readlines()[:-1]
            data = [line for line in data if line[0] != '#']
            split_data = " ".join(data).split("\n \n")
            ud = [parse(sent)[0] for sent in split_data]

        # optionally cap the target language (last in langs) at tgt_size sentences
        if langs[-1] == lang and tgt_size:
            tgt_size = min(tgt_size, len(ud))
            ud = ud[:tgt_size]

        for sent in ud:
            for word in sent:
                lemmas.append(word['lemma'] + "\n")
                sent_text.append(word['form'] + "\n")
                # lemmas.append(" ".join([w for w in word['lemma']]).encode('utf8') + "\n")
                # sent_text.append(" ".join([w for w in word['form']]).encode('utf8') + "\n")

        with open("lemma-words/" + lang + "_words.txt", 'w') as f:
            f.writelines(sent_text)
        with open("lemma-words/" + lang + "_lemmas.txt", 'w') as f:
            f.writelines(lemmas)

        annot_sents[lang] = [(w, m) for w, m in zip(sent_text, lemmas)]

    return annot_sents
def __init__(self, file_path):
    if (os.path.exists(self.tw_object_file_path)
            and os.path.exists(self.ttt_object_file_path)):
        self.read_external_file()
    else:
        corpus_file = open(file_path, 'r')
        # normalize runs of spaces to tabs so the CoNLL-U parser accepts the file
        corpus_data = re.sub(r" +", r"\t", corpus_file.read())
        sentence_list = parse(corpus_data)
        i = 0
        for sentence in sentence_list:
            i = i + 1
            print('start ' + str(i))
            tag1 = Tag.NOTEXIST
            tag2 = Tag.NOTEXIST
            for token in sentence:
                word = token['form']
                tag_detector = Tag.NOTEXIST
                tag = tag_detector.get_tag(token['upostag'])
                if i == 12541:
                    print(word)
                # update word, tag-word, and tag n-gram counts
                self.update_word_dict(word)
                self.update_tag_word_dict(str(tag) + '|' + word)
                ttt_tag_key = str(tag) + '|' + str(tag1) + '|' + str(tag2)
                tt_tag_key = str(tag1) + '|' + str(tag2)
                self.update_ttt_dict(ttt_tag_key)
                self.update_tt_dict(tt_tag_key)
                self.update_t_dict(str(tag))
                # shift the tag history window
                tag2 = tag1
                tag1 = tag
            print('finish ' + str(i))
        self.count_tw_prob_dict()
        self.count_wt_prob_dict()
        self.count_ttt_prob_dict()
        self.write_external_file()
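The trainer references a `Tag` helper that is not included in the snippet. A sketch consistent with the calls above (a `NOTEXIST` sentinel that also exposes `get_tag`, with `str()` producing the key used in the n-gram dictionaries) could be:

class Tag:
    # Hypothetical stand-in for the Tag helper used above; the original
    # implementation is not shown in the snippet.
    def __init__(self, name='NOTEXIST'):
        self.name = name

    def __str__(self):
        return self.name

    def get_tag(self, upostag):
        # map a UPOS string onto a Tag; fall back to the sentinel
        return Tag(upostag) if upostag else Tag.NOTEXIST

Tag.NOTEXIST = Tag()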
def parse_sentence_conllu(conllu_raw_string):
    # access POS tags as: parsed[sent_id][word_id][x], where x=0 -> form, x=1 -> POS
    conllu_list_raw = conllu_raw_string.replace("\t", " ").split("\n\n")
    conllu_list_final = []
    for clr in conllu_list_raw:
        if len(clr) > 2:
            clr_part = clr.split("\n")
            conllu_final = ""
            # skip the first two lines of each block (the comment headers)
            for cline in range(2, len(clr_part)):
                conllu_final += clr_part[cline] + "\n"
            parsed = parse(conllu_final)
            conllu_list_final.append([])
            for idx in range(0, len(parsed[0])):
                par = [parsed[0][idx]['form'], parsed[0][idx]['upostag']]
                conllu_list_final[-1].append(par)
    return conllu_list_final
#!/usr/bin/python3
# main.py
import sys
from conllu.parser import parse
from graph import Graph

if len(sys.argv) != 2:
    raise Exception('Invalid Input')

data = parse(open(sys.argv[1], 'r').read())
g = Graph(len(data[0]) + 1)
for word in data[0]:
    g.add_edge(word['head'], word['id'])

if g.is_cyclic():
    print("Cycle found with the following path: {0}".format(g.rec_path))
else:
    print("No cycles found.")
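main.py imports a `graph.Graph` that is not shown. A minimal sketch matching the call sites (a vertex-count constructor, `add_edge` for a directed edge, and a DFS-based `is_cyclic` that leaves the offending path in `rec_path`) might be:

# graph.py -- minimal sketch reconstructed from main.py's call sites;
# not necessarily the project's actual implementation.
class Graph:
    def __init__(self, num_vertices):
        self.adj = {v: [] for v in range(num_vertices)}
        self.rec_path = []

    def add_edge(self, src, dst):
        self.adj[src].append(dst)

    def is_cyclic(self):
        visited, on_stack = set(), set()

        def dfs(v):
            visited.add(v)
            on_stack.add(v)
            self.rec_path.append(v)
            for w in self.adj[v]:
                if w in on_stack:
                    self.rec_path.append(w)  # close the cycle in the path
                    return True
                if w not in visited and dfs(w):
                    return True
            on_stack.discard(v)
            self.rec_path.pop()
            return False

        return any(dfs(v) for v in self.adj if v not in visited)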
        # known-word branch of a most-frequent-tag lookup; the enclosing
        # function header and `if` test are cut off in this snippet
        candidate_tags = upostag_dict[form]
        tag = max(candidate_tags, key=candidate_tags.get)
        # print('Candidate: %s => %s' % (candidate_tags, tag))
    else:
        # assume OOV word as noun
        tag = 'NOUN'
    return tag


# for consistency
# SEED = 7

# read training data from file
with open('id-ud-train.conllu', 'r') as f:
    raw_train_data = f.read()
full_train_data = parse(raw_train_data)
num_full_train_data = len(full_train_data)
print("Num full train data: %s" % num_full_train_data)

# create validation data by splitting the training data
# the split: 90% training data : 10% validation data
# random.seed(SEED)
random.shuffle(full_train_data)
validation_data = full_train_data[0:559]
train_data = full_train_data[559:]
num_validation_data = len(validation_data)
print("Num validation data: %s" % num_validation_data)
num_train_data = len(train_data)
def test_parse_data4(self):
    self.assertEqual(parse(data4), data4_flat)
def test_parse_data3(self):
    self.assertEqual(parse(data3), data3_flat)
def test_parse_data2(self):
    self.assertEqual(parse(data2), data2_flat)
def test_parse_only_id_data1(self):
    ids = [parsed_line["id"] for parsed_line in parse(data1, fields=["id"])[0]]
    num_lines = len(data1.strip().split("\n"))
    self.assertEqual(ids, list(range(1, num_lines + 1)))
def test_parse_data1(self):
    self.assertEqual(parse(data1), data1_flat)
def read_external_file(self, file_path):
    corpus_file = open(file_path, 'r')
    corpus_data = re.sub(r" +", r"\t", corpus_file.read())
    sentence_list = parse(corpus_data)
    return sentence_list
from conllu.parser import parse
import sys
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# import os
# os.chdir('Desktop\Informatika\Semester_7\IF4072_NLP\IF4072\Pos Tagging')

# data = open('id-ud-train.conllu', mode='r', encoding='utf-8').read()  # For Python 3
data = open('id-ud-train.conllu', mode='r').read()
data_parsed = parse(data)

feature_data = []
target_data = []
for i in range(0, len(data_parsed)):
    for j in range(0, len(data_parsed[i])):
        # current word
        form = data_parsed[i][j].get('form')
        # word before and its POS tag
        word_before = ""
        postag_before = ""
        if data_parsed[i][j].get('id') == 1:
            word_before = "Null"
            postag_before = "Null"
        else:
            word_before = data_parsed[i][j - 1].get('form')
            postag_before = data_parsed[i][j - 1].get('upostag')
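        # -- Illustrative continuation (assumed, not from the original
        # source): the snippet is cut off at this point in the loop. One
        # plausible next step is to collect features and targets, then
        # train the DecisionTreeClassifier imported above; the hashing-based
        # feature encoding here is an illustrative choice.
        feature_data.append([hash(form) % 100000,
                             hash(word_before) % 100000,
                             hash(postag_before) % 1000])
        target_data.append(data_parsed[i][j].get('upostag'))

X_train, X_test, y_train, y_test = train_test_split(
    feature_data, target_data, test_size=0.1, random_state=7)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
print("Accuracy: %s" % accuracy_score(y_test, clf.predict(X_test)))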
def test_parse_data7(self):
    parse(data7)
def test_parse_data6(self):
    self.assertEqual(parse(data6), data6_flat)
def conllu_to_tokens(conllu):
    """Extract tokens from CoNLL-U."""
    for sentence in parse(conllu):
        for word in sentence:
            yield word
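A short usage example for `conllu_to_tokens`, assuming the pre-1.0 `conllu.parser` API used throughout these snippets, where each yielded word is an OrderedDict keyed by the standard CoNLL-U fields:

# Toy one-token sentence; tabs separate the ten CoNLL-U columns.
sample = "1\tHello\thello\tINTJ\t_\t_\t0\troot\t_\t_\n"
for token in conllu_to_tokens(sample):
    print(token["form"], token["upostag"])  # -> Hello INTJ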
def test_parse(self):
    from tests.fixtures.data1_flat import data1_expected
    self.assertEqual(parse(data1), data1_expected)