def beforeTurbo(data):
    """Parse CoNLL-U text from *data*, clear the trees, and write the
    serialized result to ``tmp/cleared.conll`` for Turbo testing.

    :param data: an open file-like object (or iterable of lines) of CoNLL-U text.
    """
    print('parsing data as trees...')
    trees = list(parse_tree_incr(data))
    # NOTE(review): this assumes clear_tree() RETURNS the cleared tree, but
    # main() below uses it as an in-place mutator — confirm which contract holds.
    cleared_trees = [clear_tree(tree) for tree in trees]
    texts = [tree.serialize() for tree in cleared_trees]
    # 'with' guarantees the output handle is closed (the original leaked it,
    # and shadowed the builtin name 'file').
    with open("tmp/cleared.conll", 'w') as out:
        for text in texts:
            out.write(text)
            # The original also did out.write(u"") here — a no-op, removed.
    print("ready for TurboTesting")
def main():
    """Read the PerDT training CoNLL-U file, clear each parsed tree with
    ``clear_tree`` (in place), and write the serialized trees to
    ``tmp/cleared-train.conll``."""
    print("opening training data...")
    # 'with' closes the input handle even on error (the original leaked it).
    with open("../PerDT/Data/train.conll", 'r') as data:
        print('parsing data as trees...')
        trees = list(parse_tree_incr(data))
    print("clearing the trees of data")
    for tree in trees:
        clear_tree(tree)  # mutates the tree in place (return value unused)
    print("serializing the trees")
    texts = [tree.serialize() for tree in trees]
    # 'with' closes the output handle (original leaked it and shadowed 'file').
    with open("tmp/cleared-train.conll", 'w') as out:
        for text in texts:
            out.write(text)
            # The original also wrote u"" here — a no-op, removed.
    print("finished")
def parse_all_sentences():
    """Scan the globally opened CoNLL-U stream, collect every tree whose BFS
    finds a relevant noun, and dump the collected trees (delimited) to
    'relevant sentences.txt'."""
    for root in conllu.parse_tree_incr(data_file):
        # keep the tree only when the relevance predicate fires anywhere in it
        if BFS(root, check_if_relevant_noun):
            list_of_relevant_roots.append(root)
    with open('relevant sentences.txt', 'w', encoding="utf-8") as out:
        for relevant in list_of_relevant_roots:
            serialized = relevant.serialize()
            out.write('----- new sentence -----\n')
            out.write('%s\n' % serialized)
            out.write('----- sentence end -----\n')
    print("relevants = ", list_of_relevant_roots)
    print("num of relevants = ", len(list_of_relevant_roots))
def generate(self):
    """Generate the dataset and write it to the output file.

    Output format per line: ``full_sentence \t split1 <::::> split2``
    """
    i = 0
    output_file = open(self.output_path_, 'wb')
    try:
        # 'with' closes the input handle (the original opened it inline and
        # leaked it).
        with open(self.input_, 'r') as source:
            for line in conllu.parse_tree_incr(source):
                chunks = []
                # Hoist the repeated root-token lookup done four times in the
                # original.
                root_pair = (line.__dict__['token']['form'],
                             line.__dict__['token']['id'])
                full_sentence = self.to_string(line.children,
                                               [root_pair]).strip()
                for child in line.children:
                    deprel = child.__dict__['token']['deprel'].lower()
                    if deprel not in self.ignore_projections_:
                        chunks.append(self.to_string(
                            child.children,
                            [root_pair,
                             (child.__dict__['token']['form'],
                              child.__dict__['token']['id'])]))
                splits = self.split_(chunks)
                # Plain loop instead of the original's side-effecting list
                # comprehension.
                for split in splits:
                    output_file.write(" {0} \t {1} \n".format(
                        full_sentence, split).encode())
                logging.info("[WROTE] : {0}th sentence ".format(i))
                i += 1
    finally:
        # Close the output even if parsing raises (original closed it only on
        # the success path).
        output_file.close()
def __init__(self, fname, embed=None, device=None, max_len=1e3,
             pos_to_id_dict=None, read_tree=False):
    """Load a CoNLL-U file into parallel per-sentence lists.

    :param fname: path to the CoNLL-U file; it is opened twice, once for the
        flat token view (parse_incr) and once for the tree view
        (parse_tree_incr), and the two iterators are consumed in lockstep.
    :param embed: optional embedding source passed to self.text_to_embed;
        when None no embeddings are computed.
    :param device: stored on self.device — presumably a torch device; the
        constructor itself does not use it.
    :param max_len: sentences with more than max_len tokens are skipped.
    :param pos_to_id_dict: optional pre-built POS-tag -> id mapping; when
        None a fresh auto-growing mapping is created.
    :param read_tree: accepted but unused in this constructor —
        NOTE(review): confirm whether it is dead or consumed elsewhere.
    """
    super(ConlluData, self).__init__()
    self.device = device
    if pos_to_id_dict is None:
        # defaultdict whose factory closes over itself: each unseen tag gets
        # the next consecutive integer id.
        pos_to_id = defaultdict(lambda: len(pos_to_id))
    else:
        pos_to_id = pos_to_id_dict
    # Parallel per-sentence accumulators; all end up the same length.
    text = []
    tags = []
    trees = []
    heads = []
    embedding = []
    right_num_deps = []
    left_num_deps = []
    deps = []
    # Two independent handles on the same file so the flat-token and tree
    # parsers each get their own stream position.
    fin = open(fname, "r", encoding="utf-8")
    fin_tree = open(fname, "r", encoding="utf-8")
    data_file_tree = parse_tree_incr(fin_tree)
    data_file = parse_incr(fin)
    for id_, (sent, tree) in enumerate(zip(data_file, data_file_tree)):
        sent_list = []
        tag_list = []
        head_list = []
        right_num_deps_ = []
        left_num_deps_ = []
        sent_n = []
        deps_list = []
        # delete multi-word token: ranged ids (e.g. "3-4") are tuples, so
        # keeping only int ids drops multi-word token lines.
        for token in sent:
            if isinstance(token["id"], int):
                sent_n += [token]
        for token in sent_n:
            sent_list.append(token["form"])
            # Unknown '_' POS tags are mapped to the "X" bucket.
            pos_id = pos_to_id[token[
                "upostag"]] if token["upostag"] != '_' else pos_to_id["X"]
            tag_list.append(pos_id)
            # -1 represents root (CoNLL-U heads are 1-based; 0 means root)
            head_list.append(token["head"] - 1)
            deps_list.append(token["deprel"])
        # Skip over-long sentences entirely (nothing appended for them).
        if len(tag_list) > max_len:
            continue
        # Count, for each head position, how many dependents attach on each
        # side of it.
        right_num_deps_ = [0] * len(head_list)
        left_num_deps_ = [0] * len(head_list)
        for i, head_id in enumerate(head_list):
            if head_id != -1:
                if i < head_id:
                    left_num_deps_[head_id] += 1
                elif i > head_id:
                    right_num_deps_[head_id] += 1
                else:
                    # A token can never be its own head in valid CoNLL-U.
                    raise ValueError("head is itself !")
        text.append(sent_list)
        if embed is not None:
            embedding.append(self.text_to_embed(id_, sent_list, embed))
        tags.append(tag_list)
        heads.append(head_list)
        right_num_deps.append(right_num_deps_)
        left_num_deps.append(left_num_deps_)
        trees.append(tree)
        deps.append(deps_list)
    self.trees = trees
    self.text = text
    self.embed = embedding
    self.postags = tags
    self.heads = heads
    self.deps = deps
    self.right_num_deps = right_num_deps
    self.left_num_deps = left_num_deps
    self.pos_to_id = pos_to_id
    # Inverse mapping for decoding predicted tag ids back to strings.
    self.id_to_pos = {v: k for (k, v) in pos_to_id.items()}
    self.length = len(self.text)
    # if embed is not None:
    #     self.text_to_embed(embed)
    fin.close()
    fin_tree.close()
sentence.metadata ### Turn a TokenList back into CoNLL-U sentence.serialize() # The format is not desirable ### Turn a Tokenlist into a TokenTree sentence.to_tree() ### Use parse_tree() to parse into a list of dependency trees from conllu import parse_tree sentences = parse_tree(data) sentences from conllu import parse_tree_incr for tokentree in parse_tree_incr(data_file): print(tokentree) root = sentences[0] root root.print_tree() root.token children = root.children children root.metadata root.serialize()
def test_parse_tree_incr(self):
    """Incremental tree parsing over a stream must match batch parse_tree."""
    batch_result = parse_tree(data)
    incremental_result = list(parse_tree_incr(StringIO(data)))
    self.assertEqual(batch_result, incremental_result)
def test_parse_tree_incr(self):
    """Incremental tree parsing from a file must match batch parse_tree."""
    batch_result = parse_tree(data)
    incremental_result = list(parse_tree_incr(string_to_file(data)))
    self.assertEqual(batch_result, incremental_result)
# Split a UD CoNLL-U file into one output file per document, using the
# "newdoc id" sentence metadata as the document boundary and filename.
print(
    "Usage: python conllu_to_docs.py UD-conllu-file.conllu output_folder/")
path_to_udfile = sys.argv[1]
outpath = sys.argv[2]
if not os.path.exists(outpath):
    os.mkdir(outpath)

outfile = None
numdocs = 0
with open(path_to_udfile, "r", encoding="utf-8") as data_file:
    for sentence in parse_tree_incr(data_file):
        md = sentence.metadata
        if "newdoc id" in md:
            # close the previous document's file before starting a new one
            if outfile is not None:
                outfile.close()
            docid = md["newdoc id"]
            outfile = open(os.path.join(outpath, docid), 'w')
            numdocs += 1
        # write the current sentence
        # NOTE(review): assumes the first sentence carries a "newdoc id";
        # otherwise outfile is still None here and this raises AttributeError.
        outfile.write(sentence.serialize())
# Fix: the original never closed the last document's file handle.
if outfile is not None:
    outfile.close()
print(f"Wrote out {numdocs} docs to {outpath}")
def readConllu(filename):
    """Print every dependency tree parsed from *filename* (CoNLL-U format).

    :param filename: path to a UTF-8 encoded CoNLL-U file.
    """
    # 'with' closes the handle even on parse errors (the original leaked it).
    with open(filename, "r", encoding="utf-8") as data_file:
        for tokentree in parse_tree_incr(data_file):
            print(tokentree)