def __init__(self, basedir=None):
    if basedir is None:
        basedir = self.default_basedir
    self.basedir = basedir
    self.reader = BracketParseCorpusReader(basedir, 'negra-corpus2.penn', comment_char='%')

def write_to_file(orig_file, revised_file, outfile=None, no_heuristic_mismatch_fix=False):
    reader = BracketParseCorpusReader('.', [])
    orig_trees = reader.parsed_sents(orig_file)
    revised_trees = reader.parsed_sents(revised_file)

    # The revised PTB parses have one less tree in the training split.
    # This attempts to patch the problem by skipping this tree.
    if not no_heuristic_mismatch_fix:
        orig_trees = list(orig_trees)
        revised_trees = list(revised_trees)
        if len(orig_trees) == 39832 and len(revised_trees) == 39831:
            del orig_trees[4906]

    converted_trees = convert_to_revised_tokenization(orig_trees, revised_trees)
    if outfile is None:
        for tree in converted_trees:
            print(tree.pformat(margin=1e100))
    else:
        with open(outfile, 'w') as f:
            for tree in converted_trees:
                tree_rep = tree.pformat(margin=1e100)
                assert "\n" not in tree_rep
                f.write(tree_rep)
                f.write("\n")

def get_id_list(target_root, splits):
    res = []
    for fname in glob_tree_files(target_root, splits):
        reader = BracketParseCorpusReader('.', [fname])
        num_sents = len(reader.parsed_sents())
        doc_id = os.path.splitext(os.path.split(fname)[-1])[0]
        for sent_id in range(num_sents):
            sent_id = "{}_{:03}".format(doc_id, sent_id)
            res.append((doc_id, sent_id))
    return res

def _get_trees(path: str) -> Iterator[list]:
    """Takes either a directory of .mrg files or a single .txt file."""
    if os.path.isdir(path):
        fileids = r"wsj_.*\.mrg"
        reader = BracketParseCorpusReader(path, fileids)
        yield from reader.parsed_sents()
    else:
        with open(path) as fh:
            for line in fh.read().split("\n\n"):
                yield Tree.fromstring(line)

def write_corpus_file(dirname):
    """Takes all .mrg PTB files in a directory and puts them in a single file.

    This allows for faster retrieval from disk."""
    fileids = r"wsj_.*\.mrg"
    reader = BracketParseCorpusReader(dirname, fileids)
    text = "\n\n".join(str(tree) for tree in reader.parsed_sents())
    filename = dirname + ".txt"
    with open(filename, "w") as fh:
        fh.write(text)

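# Hedged usage sketch tying the two helpers above together: "wsj_dir" is a
# placeholder directory of wsj_*.mrg files, not a path from the original code.
# write_corpus_file caches the whole treebank as one text file, and _get_trees
# then streams the trees back from that cache instead of the .mrg directory.
write_corpus_file("wsj_dir")            # creates wsj_dir.txt
for tree in _get_trees("wsj_dir.txt"):  # one nltk.Tree per blank-line-separated block
    print(" ".join(tree.leaves()))
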
def write_to_file(data_root, splits, outfile, add_top=False):
    reader = BracketParseCorpusReader('.', glob_files(data_root, splits))
    with open(outfile, 'w') as f:
        for tree in reader.parsed_sents():
            tree_rep = tree.pformat(margin=1e100)
            if add_top:
                tree_rep = "(TOP %s)" % tree_rep
            assert "\n" not in tree_rep
            f.write(tree_rep)
            f.write("\n")

class Negra(treebank.SavedTreebank):
    default_basedir = 'negra-corpus'
    trees = []
    filename = 'negra.treebank'

    def __init__(self, basedir=None):
        if basedir is None:
            basedir = self.default_basedir
        self.basedir = basedir
        self.reader = BracketParseCorpusReader(basedir, 'negra-corpus2.penn', comment_char='%')

    def parsed(self, files=None):
        # for t in treebank.SavedTreebank.parsed(self, files):
        # Python 2: izip pairs an index with each parsed sentence (like enumerate).
        for (i, t) in itertools.izip(itertools.count(), self.reader.parsed_sents()):
            yield NegraTree(t, labels=i)

    def get_tree(self, offset=0):
        t = self.get_trees2(offset, offset + 1)[0]
        return t

    # Return the trees found at position i, with start <= i < end
    def get_trees2(self, start=0, end=None):
        lt = [t for t in itertools.islice(self.parsed(), start, end)]
        return lt

    def is_ellipsis(self, s):
        return is_ellipsis(s)

    def is_punctuation(self, s):
        return is_punctuation(s)

class Corpus(object):
    def __init__(self, root, fids='.*', tag_taxonomy=None, headship=None):
        self.corpus = BracketParseCorpusReader(root=root, fileids=fids)
        self._corpus_iter = (sent for sent in self.corpus.parsed_sents())

    def __iter__(self):
        return self

    def next(self):
        try:
            return self._corpus_iter.next()
        except StopIteration:
            # Rebuild the generator so the corpus can be iterated again, then re-raise.
            self._corpus_iter = (sent for sent in self.corpus.parsed_sents())
            raise

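# Hedged usage sketch (Python 2 iterator protocol, matching the class above):
# the corpus can be iterated more than once because next() rebuilds the
# generator before re-raising StopIteration. The root path and file pattern
# are placeholders, not values from the original code.
corpus = Corpus('treebank/parsed', fids=r'.*\.mrg')
first_pass = sum(1 for _ in corpus)
second_pass = sum(1 for _ in corpus)  # same count: the iterator was reset
assert first_pass == second_pass
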
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    files = []
    for r, d, f in os.walk(file_path):
        for file in f:
            if '.tree' in file:
                files.append(os.path.join(r, file))
    for f in files:
        directory, filename = os.path.split(f)
        logger.info("Reading instances from lines in file at: %s", file_path)
        for parse in BracketParseCorpusReader(root=directory, fileids=[filename]).parsed_sents():
            self._strip_functional_tags(parse)
            # This is un-needed and clutters the label space.
            # All the trees also contain a root S node.
            if parse.label() == "VROOT" or parse.label() == "TOP":
                parse = parse[0]
            pos_tags = [x[1] for x in parse.pos()] if self._use_pos_tags else None
            yield self.text_to_instance(parse.leaves(), pos_tags)

def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    directory, filename = os.path.split(file_path)
    logger.info("Reading instances from lines in file at: %s", file_path)
    for parse in BracketParseCorpusReader(root=directory, fileids=[filename]).parsed_sents():
        pos_tags = [x[1] for x in parse.pos()] if self._use_pos_tags else None
        yield self.text_to_instance(parse.leaves(), pos_tags, parse)

def get_trees_from_bracket_file(filename) -> List[Tree]:
    directory, filename = os.path.split(filename)
    trees = list(BracketParseCorpusReader(root=directory, fileids=[filename]).parsed_sents())
    modified_trees = []
    for tree in trees:
        strip_functional_tags(tree)
        # This is un-needed and clutters the label space.
        # All the trees also contain a root S node.
        if tree.label() == "VROOT" or tree.label() == "TOP":
            tree = tree[0]
        modified_trees.append(tree)
    return modified_trees

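# strip_functional_tags is called above but not shown in these snippets. A
# hedged sketch of what such a helper typically does (modelled on the AllenNLP
# reader; the real implementation may differ): remove function/coindexation
# suffixes such as NP-SBJ or NP=2 from every non-terminal label, in place,
# without touching pre-terminal (POS) labels like -LRB-.
def strip_functional_tags(tree) -> None:
    clean_label = tree.label().split("=")[0].split("-")[0].split("|")[0]
    tree.set_label(clean_label)
    for child in tree:
        if not isinstance(child[0], str):
            strip_functional_tags(child)
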
PCFG_UNARY_RULES_DICT_FILE = os.path.join(MODEL_DIR, "PCFG_unary_dict.pkl")
PCFG_BINARY_RULES_DICT_FILE = os.path.join(MODEL_DIR, "PCFG_binary_dict.pkl")
PCFG_POSTAGS_DICT_FILE = os.path.join(MODEL_DIR, "PCFG_postags_dict.pkl")
PCFG_NT_SET_FILE = os.path.join(MODEL_DIR, "PCFG_non_terminals_set.pkl")
PCFG_T_SET_FILE = os.path.join(MODEL_DIR, "PCFG_terminals_set.pkl")
PCFG_POSTAGS_SET_FILE = os.path.join(MODEL_DIR, "PCFG_postags_set.pkl")

# In[2]:
t0 = time()
print(">>> Reading corpus treebanks from file...")
corpus_root = sys.argv[1]
train_file_pattern = r".*_train.tb"
ptb_train = BracketParseCorpusReader(corpus_root, train_file_pattern)
print(">>> Corpus treebanks read done in %0.3fs.\n" % (time() - t0))

# In[3]:
t0 = time()
print(">>> Parsing collection of rules and words...")

# Objects for unary rules (A -> B)
unary_rules_freq = defaultdict(float)
unary_rules_cnt_by_lhs = defaultdict(int)
unary_rules_occur_cnt = 0
unary_lhs_set = set()
unary_rhs_set = set()

def _parse(self, t):
    # Drop (CODE ...) and (ID ...) annotation nodes; skip trees that are empty
    # after removal.
    t = re.sub(r'(?u)\((CODE|ID)[^\)]*\)', '', t)
    if re.match(r'\s*\(\s*\)\s*$', t):
        return None
    return BracketParseCorpusReader._parse(self, t)

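# Hedged sketch of how a _parse override like the one above is usually wired
# in: subclass BracketParseCorpusReader so the annotation nodes are stripped
# before the stock parser runs. The class name and the commented-out reader
# arguments are illustrative only.
import re

from nltk.corpus.reader import BracketParseCorpusReader


class CleanedBracketParseCorpusReader(BracketParseCorpusReader):
    def _parse(self, t):
        t = re.sub(r'(?u)\((CODE|ID)[^\)]*\)', '', t)
        if re.match(r'\s*\(\s*\)\s*$', t):
            return None
        return BracketParseCorpusReader._parse(self, t)

# reader = CleanedBracketParseCorpusReader('corpus_root', r'.*\.psd')
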
def get_raw_text_for_trees(treebank_root, splits, tree_files):
    lines = []
    for fname in glob_raw_files(treebank_root, splits):
        with open(fname, 'r', encoding="windows-1252") as f:
            for line in f:
                if line.strip() and not line.startswith('.START'):
                    # Delete invalid characters caused by encoding issues
                    line = line.replace("Õ", "").replace("å", "")
                    lines.append(line)

    reader = BracketParseCorpusReader('.', tree_files)
    target_sents = reader.sents()

    line_iter = iter(lines)
    line = ""
    pairs = []
    for target_sent in target_sents:
        if not line.strip():
            line = next(line_iter)

        # Handle PTB-style escaping mismatches
        target_sent = [standardize_form(word) for word in target_sent]

        # Handle transpositions: sometimes the raw text transposes punctuation,
        # while the parsed version cleans up this transposition
        if 'U.S..' in ''.join(target_sent):
            target_sent = [x.replace('U.S.', 'U.S') for x in target_sent]
        if 'Co.,' in ''.join(target_sent) and 'Co,.' in line:
            target_sent = [x.replace('Co.', 'Co') for x in target_sent]
        if "But that 's" in ' '.join(target_sent) and "But's that" in line:
            target_sent = [x.replace("that", "tha") for x in target_sent]
            target_sent = [x.replace("'s", "t") for x in target_sent]
        if ('-- Freshman football player' in line
                or '-- Sophomore football player' in line
                or '-- Junior football player' in line
                or '-- Senior football player' in line
                or '-- Graduate-student football player' in line
                or '-- Football player' in line
                or '-- Freshman basketball player' in line
                or '-- Sophomore basketball player' in line
                or '-- Junior basketball player' in line
                or '-- Senior basketball player' in line
                or '-- Basketball player' in line) and (
                    '" .' in ' '.join(target_sent) and target_sent[-1] == '.'):
            target_sent = target_sent[:-1]

        # Attempt to align raw and parsed text
        r2p, p2r = tokenizations.get_alignments(line.replace("`", "'"), target_sent)

        # Handle skips: some lines in the raw data are not parsed
        while not all(p2r):
            go_next = False
            if line.startswith('(See') and '-- WSJ' in line:
                go_next = True
            elif line == 'San Diego ':
                go_next = True
            elif line == '" ':
                go_next = True
            if go_next:
                line = next(line_iter)
                r2p, p2r = tokenizations.get_alignments(line.replace("`", "'"), target_sent)
            else:
                break

        # Handle line breaks in raw format that come in the middle of the sentence
        # (such as mid-sentence line breaks in poems)
        for _ in range(12):  # Loop limit is to aid in debugging
            if not all(p2r):
                line = line + next(line_iter)
                r2p, p2r = tokenizations.get_alignments(line.replace("`", "'"), target_sent)

        assert all(p2r)
        end = max([max(x) for x in p2r]) + 1

        # Trim excess raw text at the start
        line_to_save = line[:end]
        r2p, p2r = tokenizations.get_alignments(line_to_save.replace("`", "'"), target_sent)
        while True:
            _, alt_p2r = tokenizations.get_alignments(
                '\n'.join(line_to_save.replace("`", "'").splitlines()[1:]), target_sent)
            if sum([len(x) for x in p2r]) == sum([len(x) for x in alt_p2r]):
                line_to_save = '\n'.join(line_to_save.splitlines()[1:])
            else:
                break

        pairs.append((line_to_save, target_sent))
        line = line[end:]

    assert len(pairs) == len(target_sents)
    return [line for (line, target_sent) in pairs]

def load_corpus():
    corpus = BracketParseCorpusReader(root=parsed_path, fileids=file_names[1:])
    return corpus

corpus_dev = data[int(s_total * p_train):int(s_total * (p_train + p_dev))]
f_dev = codecs.open(TB_DEV, 'w', 'UTF-8')
for s in corpus_dev:
    f_dev.write(u"{0}\n".format(s))
f_dev.close()

corpus_test = data[int(s_total * (p_train + p_dev)):]
f_test = codecs.open(TB_TEST, 'w', 'UTF-8')
for s in corpus_test:
    f_test.write(u"{0}\n".format(s))
f_test.close()

corpus_root = r"./corpus/"
dev_file_pattern = r".*_dev\.tb"
ptb_dev = BracketParseCorpusReader(corpus_root, dev_file_pattern)
trees = ptb_dev.parsed_sents()
f_out = codecs.open(TXT_DEV, 'w', 'UTF-8')
for tree in trees:
    f_out.write(u"{0}\n".format(u" ".join(tree.leaves())))
f_out.close()

test_file_pattern = r".*_test\.tb"
ptb_test = BracketParseCorpusReader(corpus_root, test_file_pattern)
trees = ptb_test.parsed_sents()
f_out = codecs.open(TXT_TEST, 'w', 'UTF-8')
for tree in trees:
    f_out.write(u"{0}\n".format(u" ".join(tree.leaves())))
f_out.close()

print("Corpus size: %d" % s_total)

def pcfg_extraction():
    #####################################################################
    #                      Load Corpus treebanks                        #
    #####################################################################
    treebank_train = BracketParseCorpusReader("", "sequoia_train.tb")

    #####################################################################
    #       Initialisation for unary, binary and terminal rules         #
    #####################################################################
    # Unary rules
    unary_freq = defaultdict(float)      # How frequent is the rule A -> B?
    unary_cnt_by_lhs = defaultdict(int)  # How many times is A the left part of a unary rule?
    unary_occur_cnt = 0                  # How many unary rule occurrences are there?
    unary_lhs_set = set()                # Set of left part symbols
    unary_rhs_set = set()                # Set of right part symbols

    # Binary rules
    binary_freq = defaultdict(float)
    binary_cnt_by_lhs = defaultdict(int)
    binary_occur_cnt = 0
    binary_lhs_set = set()
    binary_rhs_set = set()

    # Terminal rules
    postags_freq = defaultdict(float)
    postags_cnt_by_pos = defaultdict(int)
    postags_occur_cnt = 0
    words_occur_cnt = defaultdict(int)
    postags_set = set()
    words_set = set()

    #####################################################################
    #            Parsing collection of rules and words                  #
    #####################################################################
    for tree in treebank_train.parsed_sents():
        t = tree.copy()
        t.chomsky_normal_form(horzMarkov=2)  # Convert a tree into its Chomsky normal form equivalent
        prods = t.productions()  # Get the recursive productions
        for prod in prods:
            left_symbol = prod.lhs().symbol()  # Left-hand side
            right_part = prod.rhs()            # Right-hand side
            if isinstance(right_part[0], str):
                # Terminal found: the left side is a part-of-speech tag
                right_symbol = right_part[0]
                # Save it in terminal rules
                postags_freq[(left_symbol, right_symbol)] += 1
                postags_cnt_by_pos[left_symbol] += 1
                postags_occur_cnt += 1
                words_occur_cnt[right_symbol] += 1
                postags_set.add(left_symbol)
                words_set.add(right_symbol)
            else:
                if len(right_part) == 1:
                    # Unary rule found
                    right_symbol = right_part[0].symbol()
                    # Save it in unary rules
                    unary_freq[(left_symbol, right_symbol)] += 1
                    unary_cnt_by_lhs[left_symbol] += 1
                    unary_occur_cnt += 1
                    unary_lhs_set.add(left_symbol)
                    unary_rhs_set.add(right_symbol)
                elif len(right_part) == 2:
                    # Binary rule found
                    right_symbol = tuple([nt.symbol() for nt in right_part])
                    # Save it in binary rules
                    binary_freq[(left_symbol, right_symbol)] += 1
                    binary_cnt_by_lhs[left_symbol] += 1
                    binary_occur_cnt += 1
                    binary_lhs_set.add(left_symbol)
                    binary_rhs_set.add(right_symbol)

    #####################################################################
    #        Look at the occurrences of part-of-speech tags             #
    #####################################################################
    n_tag = len(words_occur_cnt.keys())
    print('There are ' + str(n_tag) + ' different part-of-speech tags in the training set')
    plt.scatter([i for i in range(len(words_occur_cnt.keys()))],
                [words_occur_cnt[i] for i in words_occur_cnt.keys()])
    plt.title('Occurrences of part-of-speech tags')
    plt.xlabel('tag')
    plt.ylabel('occurrence')
    plt.show()

    #####################################################################
    #              Group rare words into a new tag UNK                  #
    #####################################################################
    # Replace rare words with the '<UNKNOWN>' tag
    unfrequent = set([w for w in words_set if words_occur_cnt[w] < 2])
    T_set = words_set.copy()
    T_set.difference_update(unfrequent)
    T_set.add(u"<UNKNOWN>")
    pw_pairs = list(postags_freq.keys())
    for (pos, w) in pw_pairs:
        if w in unfrequent:
            postags_freq[(pos, u"<UNKNOWN>")] += postags_freq[(pos, w)]
            postags_freq.pop((pos, w))

    #####################################################################
    #                         Normalisation                             #
    #####################################################################
    for (pos, w) in postags_freq:
        postags_freq[(pos, w)] /= postags_cnt_by_pos[pos]
    for (lhs, rhs) in unary_freq:
        unary_freq[(lhs, rhs)] /= (unary_cnt_by_lhs[lhs] + binary_cnt_by_lhs[lhs])
    for (lhs, rhs) in binary_freq:
        binary_freq[(lhs, rhs)] /= (unary_cnt_by_lhs[lhs] + binary_cnt_by_lhs[lhs])

    #####################################################################
    #                   Save the results in files                       #
    #####################################################################
    with codecs.open("PCFG_unary_freq.pkl", 'wb') as file:
        pickle.dump(unary_freq, file)
    with codecs.open("PCFG_binary_freq.pkl", 'wb') as file:
        pickle.dump(binary_freq, file)
    with codecs.open("PCFG_postags_freq.pkl", 'wb') as file:
        pickle.dump(postags_freq, file)

    #####################################################################
    #                     rhs -> lhs dictionary                         #
    #####################################################################
    unary_dict = {}
    binary_dict = {}
    postags_dict = {}
    for rhs in unary_rhs_set:
        unary_dict[rhs] = {}
    for (lhs, rhs) in unary_freq:
        unary_dict[rhs][lhs] = unary_freq[(lhs, rhs)]
    for rhs in binary_rhs_set:
        binary_dict[rhs] = {}
    for (lhs, rhs) in binary_freq:
        binary_dict[rhs][lhs] = binary_freq[(lhs, rhs)]
    for w in T_set:
        postags_dict[w] = {}
    for (pos, w) in postags_freq:
        postags_dict[w][pos] = postags_freq[(pos, w)]

    #####################################################################
    #                   Save the results in files                       #
    #####################################################################
    with codecs.open("PCFG_unary_dict.pkl", 'wb') as file:
        pickle.dump(unary_dict, file)
    with codecs.open("PCFG_binary_dict.pkl", 'wb') as file:
        pickle.dump(binary_dict, file)
    with codecs.open("PCFG_postags_dict.pkl", 'wb') as file:
        pickle.dump(postags_dict, file)

    #####################################################################
    #            The sets of non-terminals and terminals                #
    #####################################################################
    # Store the sets of non-terminals and terminals
    NT_set = unary_lhs_set.union(binary_lhs_set)
    with codecs.open("NT_set.pkl", 'wb') as file:
        pickle.dump(NT_set, file)
    with codecs.open("T_set.pkl", 'wb') as file:
        pickle.dump(T_set, file)
    with codecs.open("postags_set.pkl", 'wb') as file:
        pickle.dump(postags_set, file)
    with codecs.open("words_set.pkl", 'wb') as file:
        pickle.dump(words_set, file)
    return ()

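# Hedged micro-example of the rule-extraction pattern used above: convert a toy
# tree to Chomsky normal form and split its productions into terminal, unary,
# and binary rules. The sentence is made up purely for illustration.
from collections import defaultdict

from nltk import Tree

toy = Tree.fromstring("(SENT (NP (DET le) (NC chat)) (VN (V dort)))")
toy.chomsky_normal_form(horzMarkov=2)

unary, binary, terminal = defaultdict(int), defaultdict(int), defaultdict(int)
for prod in toy.productions():
    lhs, rhs = prod.lhs().symbol(), prod.rhs()
    if isinstance(rhs[0], str):
        terminal[(lhs, rhs[0])] += 1
    elif len(rhs) == 1:
        unary[(lhs, rhs[0].symbol())] += 1
    else:
        binary[(lhs, tuple(nt.symbol() for nt in rhs))] += 1

print(dict(terminal))  # {('DET', 'le'): 1, ('NC', 'chat'): 1, ('V', 'dort'): 1}
print(dict(binary))    # {('SENT', ('NP', 'VN')): 1, ('NP', ('DET', 'NC')): 1}
print(dict(unary))     # {('VN', 'V'): 1}
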
def load_trees(const_path, text_path=None, text_processing="default"):
    """Load a treebank.

    The standard tree format presents an abstracted view of the raw text, with the
    assumption that a tokenizer and other early stages of the NLP pipeline have
    already been run. These can include formatting changes like escaping certain
    characters (e.g. -LRB-) or transliteration (see e.g. the Arabic and Hebrew
    SPMRL datasets). Tokens are not always delimited by whitespace, and the raw
    whitespace in the source text is thrown away in the PTB tree format. Moreover,
    in some treebanks the leaves of the trees are lemmas/stems rather than word
    forms.

    All of this is a mismatch for pre-trained transformer models, which typically
    do their own tokenization starting with raw unicode strings. A mismatch
    compared to pre-training often doesn't affect performance if you just want to
    report F1 scores within the same treebank, but it raises some questions when
    it comes to releasing a parser for general use: (1) Must the parser be
    integrated with a tokenizer that matches the treebank convention? In fact,
    many modern NLP libraries like spaCy train on dependency data that doesn't
    necessarily use the same tokenization convention as constituency treebanks.
    (2) Can the parser's pre-trained model be merged with other pre-trained system
    components (via methods like multi-task learning or adapters), or must it
    remain its own system because of tokenization mismatches?

    This tree-loading function aims to build a path towards parsing from raw text
    by using the `text_path` argument to specify an auxiliary file that can be
    used to recover the original unicode string for the text. Parser layers above
    the pre-trained model may still use gold tokenization during training, but
    this will possibly help make the parser more robust to tokenization
    mismatches.

    On the other hand, some benchmarks involve evaluating with gold tokenization,
    and naively switching to using raw text degrades performance substantially.
    This can hopefully be addressed by making the parser layers on top of the
    pre-trained transformers handle tokenization more intelligently, but this is
    still a work in progress and the option remains to use the data from the tree
    files with minimal processing controlled by the `text_processing` argument to
    clean up some escaping or transliteration.

    Args:
        const_path: Path to the file with one tree per line.
        text_path: (optional) Path to a file that provides the correct spelling
            for all tokens (without any escaping, transliteration, or other
            mangling) and information about whether there is whitespace after each
            token. Files in the CoNLL-U format
            (https://universaldependencies.org/format.html) are accepted, but the
            parser also accepts similarly-formatted files with just three fields
            (ID, FORM, MISC) instead of the usual ten. Text is recovered from the
            FORM field and any "SpaceAfter=No" annotations in the MISC field.
        text_processing: Text processing to use if no text_path is specified:
            - 'default': undo PTB-style escape sequences and attempt to guess
              whitespace surrounding punctuation
            - 'arabic': guess that all tokens are separated by spaces
            - 'arabic-translit': undo Buckwalter transliteration and guess that
              all tokens are separated by spaces
            - 'chinese': keep all tokens unchanged (i.e. do not attempt to find
              any escape sequences), and assume no whitespace between tokens
            - 'hebrew': guess that all tokens are separated by spaces
            - 'hebrew-translit': undo transliteration (see Sima'an et al. 2002)
              and guess that all tokens are separated by spaces

    Returns:
        A list of ParsingExample objects, which have the following attributes:
            - `tree` is an instance of nltk.Tree
            - `words` is a list of strings
            - `space_after` is a list of booleans
    """
    reader = BracketParseCorpusReader("", [const_path])
    trees = reader.parsed_sents()
    if text_path is not None:
        sents = read_text(text_path)
    elif text_processing in ("arabic-translit", "hebrew-translit"):
        translit = transliterate.TRANSLITERATIONS[
            text_processing.replace("-translit", "")
        ]
        sents = []
        for tree in trees:
            words = [translit(word) for word in tree.leaves()]
            sp_after = [True for _ in words]
            sents.append((words, sp_after))
    elif text_processing in ("arabic", "hebrew"):
        sents = []
        for tree in trees:
            words = tree.leaves()
            sp_after = [True for _ in words]
            sents.append((words, sp_after))
    elif text_processing == "chinese":
        sents = []
        for tree in trees:
            words = tree.leaves()
            sp_after = [False for _ in words]
            sents.append((words, sp_after))
    elif text_processing == "default":
        sents = []
        for tree in trees:
            words = ptb_unescape.ptb_unescape(tree.leaves())
            sp_after = ptb_unescape.guess_space_after(tree.leaves())
            sents.append((words, sp_after))
    else:
        raise ValueError(f"Bad value for text_processing: {text_processing}")

    assert len(trees) == len(sents)
    treebank = Treebank(
        [
            ParsingExample(tree=tree, words=words, space_after=space_after)
            for tree, (words, space_after) in zip(trees, sents)
        ]
    )
    for example in treebank:
        assert len(example.words) == len(example.leaves()), (
            "Constituency tree has a different number of tokens than the CONLL-U or "
            "other file used to specify reversible tokenization."
        )
    return treebank

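# Hedged usage sketch for load_trees above: "dev.ptb" and "dev.conllu" are
# placeholder file names, not files shipped with the original code.
treebank = load_trees("dev.ptb", text_path="dev.conllu")
for example in treebank:
    # Rebuild the raw text from the reversible tokenization.
    text = "".join(
        word + (" " if space else "")
        for word, space in zip(example.words, example.space_after)
    )
    print(text.strip())
    break
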
def data_preparation():
    #####################################################################
    #                        Load the Corpus                            #
    #####################################################################
    data_file = codecs.open("sequoia-corpus+fct.mrg_strict", 'r', 'UTF-8')
    data = data_file.read().splitlines()
    N = len(data)
    # Ignore functional labels
    for i in range(N):
        compiled = re.compile(r"(?<=\()[A-Za-z_+^\-]+\-[^ ]+")  # Compile a regular expression pattern
        data[i] = compiled.sub(lambda x: x.group().split('-')[0], data[i])  # drop '-' and what follows
    data_file.close()

    #####################################################################
    #                         Training data                             #
    #####################################################################
    # Select training data and save it in the sequoia_train.tb file
    corpusTrain = data[:int(N * 0.8)]  # 80% of the data
    fileTrain = codecs.open("sequoia_train.tb", 'w', 'UTF-8')
    for s in corpusTrain:
        fileTrain.write(u"{0}\n".format(s))
    fileTrain.close()

    # Read parenthesis-delineated parse trees
    # and save the natural sentences in sequoia_train.txt
    fileOut = codecs.open("sequoia_train.txt", 'w', 'UTF-8')
    for tree in BracketParseCorpusReader("", "sequoia_train.tb").parsed_sents():
        fileOut.write(u"{0}\n".format(u" ".join(tree.leaves())))
    fileOut.close()

    #####################################################################
    #                       Development data                            #
    #####################################################################
    # Select development data and save it in the sequoia_dev.tb file
    corpusDev = data[int(N * 0.8):int(N * 0.9)]  # 10% of the data
    fileDev = codecs.open("sequoia_dev.tb", 'w', 'UTF-8')
    for s in corpusDev:
        fileDev.write(u"{0}\n".format(s))
    fileDev.close()

    # Read parenthesis-delineated parse trees
    # and save the natural sentences in sequoia_dev.txt
    fileOut = codecs.open(os.path.join("sequoia_dev.txt"), 'w', 'UTF-8')
    for tree in BracketParseCorpusReader("", "sequoia_dev.tb").parsed_sents():
        fileOut.write(u"{0}\n".format(u" ".join(tree.leaves())))
    fileOut.close()

    #####################################################################
    #                         Testing data                              #
    #####################################################################
    # Select testing data and save it in the sequoia_test.tb file
    corpusTest = data[int(N * 0.9):]
    fileTest = codecs.open("sequoia_test.tb", 'w', 'UTF-8')
    for s in corpusTest:
        fileTest.write(u"{0}\n".format(s))
    fileTest.close()

    # Read parenthesis-delineated parse trees
    # and save the natural sentences in sequoia_test.txt
    fileOut = codecs.open("sequoia_test.txt", 'w', 'UTF-8')
    for tree in BracketParseCorpusReader("", "sequoia_test.tb").parsed_sents():
        fileOut.write(u"{0}\n".format(u" ".join(tree.leaves())))
    fileOut.close()

    fileTest2 = codecs.open("sequoia_test_tree.txt", 'w', 'UTF-8')
    for s in corpusTest:
        fileTest2.write(u"{0}\n".format(s[2:-1]))
    fileTest2.close()
    return ()

def __init__(self, root, fids='.*', tag_taxonomy=None, headship=None):
    self.corpus = BracketParseCorpusReader(root=root, fileids=fids)
    self._corpus_iter = (sent for sent in self.corpus.parsed_sents())

parser.add_argument('--inp', type=str, required=True, help='input file')
parser.add_argument('--out', type=str, required=True, help='output file')
parser.add_argument('--num_sent', type=int, default=10)
args = parser.parse_args()

use_pos_tags = True
num_sent_per_doc = args.num_sent
doc_id, sent_count, sent_offset, spanid = 1, 0, 0, 1
txt_file = bracket_file = ann_file = None

directory, filename = os.path.split(args.inp)
with open(args.inp, 'r') as fin:
    for sid, parse in tqdm(
            enumerate(
                BracketParseCorpusReader(root=directory, fileids=[filename]).parsed_sents())):
        if txt_file is None:
            txt_file = open(os.path.join(args.out, '{}.txt'.format(doc_id)), 'w')
            bracket_file = open(os.path.join(args.out, '{}.bracket'.format(doc_id)), 'w')
            ann_file = open(os.path.join(args.out, '{}.ann'.format(doc_id)), 'w')
        strip_functional_tags(parse)
        # This is un-needed and clutters the label space.
        # All the trees also contain a root S node.
        if parse.label() == "VROOT" or parse.label() == "TOP":
            parse = parse[0]
        pos_tags = [x[1] for x in parse.pos()] if use_pos_tags else None

def get_words_and_whitespace(treebank_root, splits, tree_files):
    reader = BracketParseCorpusReader('.', tree_files)
    target_sents = reader.sents()
    raw_sents = get_raw_text_for_trees(treebank_root, splits, tree_files)

    pairs = []
    for line, target_sent in zip(raw_sents, target_sents):
        # Fix some errors in the raw text that are also fixed in the parsed trees
        if "But's that just" in line:
            line = line.replace("But's that just", "But that's just")
        if 'Co,.' in line:
            line = line.replace('Co,.', 'Co.,')
        if 'U.S..' in ''.join(target_sent):
            # Address cases where an underlying "U.S." got tokenized as "U.S." "."
            # This is expected in the sentence-final position, but it seems to
            # occur in other places, too.
            line = line.replace('U.S.', 'U.S..').replace(
                'U.S.. market', 'U.S. market').replace(
                    'U.S.. agenda', 'U.S. agenda').replace(
                        'U.S.. even', 'U.S. even').replace(
                            'U.S.. counterpart', 'U.S. counterpart').replace(
                                'U.S.. unit', 'U.S. unit').replace('U.S..,', 'U.S.,')
        words = target_sent[:]
        target_sent = [
            standardize_form(word).replace("``", '"') for word in target_sent
        ]

        r2p, p2r = tokenizations.get_alignments(line.replace("`", "'"), target_sent)
        last_char_for_parsed = [max(x) if x else None for x in p2r]
        have_space_after = [None] * len(words)
        for i, word in enumerate(target_sent):
            if last_char_for_parsed[i] is None:
                continue
            char_after_word = line[last_char_for_parsed[i] + 1:last_char_for_parsed[i] + 2]
            have_space_after[i] = (char_after_word != char_after_word.lstrip())

            # Fix the few cases where the word form in the parsed data is incorrect
            if word == "'T-" and target_sent[i + 1] == 'is':
                target_sent[i] = "'T"
            if word == "16" and target_sent[i + 1:i + 5] == ['64', '-', 'inch', 'opening']:
                # This error occurs in the test set, and moreover would affect
                # tokenization by introducing an extra '/', so we don't fix it.
                # target_sent[i] = "16/"
                have_space_after[i] = True
            if word == "Gaming" and target_sent[i - 1:i + 2] == ['and', 'Gaming', 'company']:
                target_sent[i] = "gaming"
        pairs.append((target_sent, have_space_after))

        # For each token in the treebank, we have now queried the raw string to
        # determine if the token should have whitespace following it. The lines
        # below are a sanity check that the reconstructed text matches the raw
        # version as closely as possible.
        to_delete = set()
        for indices in p2r:
            if not indices:
                continue
            to_delete |= set(range(min(indices), max(indices) + 1)) - set(indices)
        raw = list(line)
        for i in sorted(to_delete, reverse=True):
            del raw[i]
        raw = "".join(raw)
        raw = " ".join(x.strip() for x in raw.split())
        guess = "".join([
            w + (" " if sp else "")
            for (w, sp) in zip(target_sent, have_space_after)
        ])
        if "filings policy-making" in guess:
            # The parsed version of this sentence drops an entire span from the raw
            # text. Maybe we shouldn't be training on this bad example, but for now
            # we'll just skip validating it.
            continue
        # Fix some issues with the raw text that are corrected in the parsed version
        raw = raw.replace("`", "'")
        raw = raw.replace("and <Tourism", "and Tourism")
        raw = raw.replace("staf reporter", "staff reporter")
        if " S$" in raw and " S$" not in guess:
            raw = raw.replace(" S$", " US$")
        raw = raw.replace("16/ 64-inch opening", "16 64-inch opening")
        if raw != guess and raw.replace('."', '".') == guess:
            raw = raw.replace('."', '".')
        # assert raw == guess
        if raw != guess:
            print(raw)
            print(guess)
            print()
    return pairs
