def segment_and_parse(doc_dict, syntax_parser, segmenter, rst_parser):
    '''
    A method to perform syntax parsing, discourse segmentation, and RST
    parsing as necessary, given a partial document dictionary.  See
    `convert_rst_discourse_tb.py` for details about document dictionaries.
    '''

    # Return empty lists if the input was blank.
    # (Check whether raw_text is available so this does not crash
    # when evaluating on pre-parsed treebank documents.)
    if 'raw_text' in doc_dict and not doc_dict['raw_text'].strip():
        # TODO: add a unit test for this.
        logging.warning('The input contained no non-whitespace characters.'
                        ' doc_id = {}'.format(doc_dict['doc_id']))
        return [], []

    if 'syntax_trees' not in doc_dict:
        # Do syntactic parsing.
        trees, starts_paragraph_list = \
            syntax_parser.parse_document(doc_dict)
        doc_dict['syntax_trees'] = [t.pformat(margin=TREE_PRINT_MARGIN)
                                    for t in trees]
        preterminals = [extract_preterminals(t) for t in trees]
        doc_dict['token_tree_positions'] = \
            [[x.treeposition() for x in preterminals_sentence]
             for preterminals_sentence in preterminals]
        doc_dict['tokens'] = [extract_converted_terminals(t) for t in trees]
        doc_dict['pos_tags'] = [[x.label() for x in preterminals_sentence]
                                for preterminals_sentence in preterminals]

    if 'edu_start_indices' not in doc_dict:
        # Do discourse segmentation.
        segmenter.segment_document(doc_dict)

        # Extract whether each EDU starts a paragraph.  (This uses
        # starts_paragraph_list from the syntactic parsing step above,
        # so segmentation is only expected for freshly parsed documents.)
        edu_starts_paragraph = []
        for tree_idx, tok_idx, _ in doc_dict['edu_start_indices']:
            val = (tok_idx == 0 and starts_paragraph_list[tree_idx])
            edu_starts_paragraph.append(val)
        assert len(edu_starts_paragraph) == len(doc_dict['edu_start_indices'])
        doc_dict['edu_starts_paragraph'] = edu_starts_paragraph

    # Extract a list of lists of (word, POS) tuples.
    edu_tokens = extract_edus_tokens(doc_dict['edu_start_indices'],
                                     doc_dict['tokens'])

    # Do RST parsing.
    rst_parse_trees = rst_parser.parse(doc_dict)

    return edu_tokens, rst_parse_trees
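# A minimal usage sketch for `segment_and_parse` (illustrative only: the
# model path and port are hypothetical, and `rst_parser` stands in for
# however the RST parsing model is constructed elsewhere in this codebase;
# the only assumption made here is that it exposes the `.parse(doc_dict)`
# method called above):
#
#     syntax_parser = SyntaxParserWrapper(port=8888, hostname='localhost')
#     segmenter = Segmenter('segmentation.model')
#     doc_dict = {'doc_id': 'example_doc',
#                 'raw_text': 'John ate breakfast. Then he went to work.'}
#     edu_tokens, rst_trees = segment_and_parse(doc_dict, syntax_parser,
#                                               segmenter, rst_parser)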
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_path',
                        help='CRF++ model file created by '
                             'tune_segmentation_model.py.')
    parser.add_argument('input_path', help='document text file')
    parser.add_argument('-zp', '--zpar_port', type=int)
    parser.add_argument('-zh', '--zpar_hostname', default=None)
    args = parser.parse_args()

    raw_text = read_text_file(args.input_path)
    doc_dict = {"doc_id": args.input_path, "raw_text": raw_text}

    syntax_parser = SyntaxParserWrapper(port=args.zpar_port,
                                        hostname=args.zpar_hostname)
    trees, _ = syntax_parser.parse_document(doc_dict)

    tokens_doc = [extract_converted_terminals(tree) for tree in trees]
    preterminals = [extract_preterminals(tree) for tree in trees]
    token_tree_positions = [[x.treeposition()
                             for x in preterminals_sentence]
                            for preterminals_sentence in preterminals]
    pos_tags = [[x.label() for x in preterminals_sentence]
                for preterminals_sentence in preterminals]

    doc_dict["tokens"] = tokens_doc
    doc_dict["syntax_trees"] = [t.pformat(margin=TREE_PRINT_MARGIN)
                                for t in trees]
    doc_dict["token_tree_positions"] = token_tree_positions
    doc_dict["pos_tags"] = pos_tags

    segmenter = Segmenter(args.model_path)
    segmenter.segment_document(doc_dict)

    edu_token_lists = extract_edus_tokens(doc_dict['edu_start_indices'],
                                          tokens_doc)
    for edu_tokens in edu_token_lists:
        print(' '.join(edu_tokens))
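# Example invocation of the segmentation CLI above (the script and file
# names here are hypothetical):
#
#     python segment_document.py segmentation.model article.txt \
#         --zpar_port 8888
#
# This prints one EDU per line, with its tokens joined by spaces.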
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('rst_discourse_tb_dir',
                        help='directory for the RST Discourse Treebank. '
                             'This should have a subdirectory '
                             'data/RSTtrees-WSJ-main-1.0.')
    parser.add_argument('ptb_dir',
                        help='directory for the Penn Treebank. This should '
                             'have a subdirectory parsed/mrg/wsj.')
    parser.add_argument('--output_dir',
                        help='directory where the output JSON files go.',
                        default='.')
    args = parser.parse_args()

    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'),
                        level=logging.INFO)

    logging.warning(
        'Warnings related to minor issues that are difficult to resolve '
        'will be logged for the following files: file1.edus, file5.edus, '
        'wsj_0678.out.edus, and wsj_2343.out.edus. Multiple "not enough '
        'syntax trees" warnings will be produced because the RSTDTB has '
        'footers that are not in the PTB (e.g., indicating where a story '
        'is written). Also, there are some loose match warnings because '
        'of differences in formatting between the treebanks.')

    for dataset in ['TRAINING', 'TEST']:
        logging.info(dataset)
        outputs = []

        for path_index, path in enumerate(
                sorted(glob(os.path.join(args.rst_discourse_tb_dir,
                                         'data', 'RSTtrees-WSJ-main-1.0',
                                         dataset, '*.edus')))):
            path_basename = os.path.basename(path)

            # if path_basename in file_mapping:
            #     # skip the not-so-well-formatted files "file1" to "file5"
            #     continue

            tokens_doc = []
            edu_start_indices = []

            logging.info('{} {}'.format(path_index, path_basename))

            ptb_id = (file_mapping[path_basename]
                      if path_basename in file_mapping
                      else path_basename)[:-9]
            ptb_path = os.path.join(args.ptb_dir, 'parsed', 'mrg', 'wsj',
                                    ptb_id[4:6], '{}.mrg'.format(ptb_id))

            with open(ptb_path) as f:
                doc = re.sub(r'\s+', ' ', f.read()).strip()
            trees = [ParentedTree.fromstring('( ({}'.format(x))
                     for x in re.split(r'\(\s*\(', doc) if x]
            for t in trees:
                convert_ptb_tree(t)

            with open(path) as f:
                edus = [line.strip() for line in f.readlines()]

            path_outfile = path[:-5]
            path_dis = '{}.dis'.format(path_outfile)
            with open(path_dis) as f:
                rst_tree_str = f.read().strip()
            rst_tree_str = fix_rst_treebank_tree_str(rst_tree_str)
            rst_tree_str = convert_parens_in_rst_tree_str(rst_tree_str)
            rst_tree = ParentedTree.fromstring(rst_tree_str)
            reformat_rst_tree(rst_tree)

            # Identify which EDUs are at the beginnings of paragraphs.
            edu_starts_paragraph = []
            with open(path_outfile) as f:
                outfile_doc = f.read().strip()
            paragraphs = re.split(r'\n\n+', outfile_doc)
            # Filter out paragraphs that don't include a word character.
            paragraphs = [x for x in paragraphs if re.search(r'\w', x)]
            # Remove extra nonword characters to make alignment easier
            # (to avoid problems with the minor discrepancies that exist
            # in the two versions of the documents).
            paragraphs = [re.sub(r'\W', r'', p.lower()) for p in paragraphs]
            p_idx = -1
            paragraph = ""
            for edu_index, edu in enumerate(edus):
                logging.debug('edu: {}, paragraph: {}, p_idx: {}'
                              .format(edu, paragraph, p_idx))
                edu = re.sub(r'\W', r'', edu.lower())
                starts_paragraph = False
                crossed_paragraphs = False
                while len(paragraph) < len(edu):
                    assert not crossed_paragraphs or starts_paragraph
                    starts_paragraph = True
                    p_idx += 1
                    paragraph += paragraphs[p_idx]
                    if len(paragraph) < len(edu):
                        crossed_paragraphs = True
                        logging.warning('A paragraph is split across trees.'
                                        ' doc: {}, chars: {}, EDU: {}'
                                        .format(path_basename,
                                                paragraphs[p_idx:p_idx + 2],
                                                edu))
                assert paragraph.index(edu) == 0
                logging.debug('edu_starts_paragraph = {}'
                              .format(starts_paragraph))
                edu_starts_paragraph.append(starts_paragraph)
                paragraph = paragraph[len(edu):].strip()
            assert p_idx == len(paragraphs) - 1
            if sum(edu_starts_paragraph) != len(paragraphs):
                logging.warning('The number of EDUs that start a paragraph '
                                'is not equal to the number of paragraphs. '
                                'This is probably due to trees being split '
                                'across paragraphs. doc: {}'
                                .format(path_basename))
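            # Next, align the EDU strings with the PTB tokens: the loop
            # below walks the tokens of each syntax tree in order, consuming
            # characters from the front of the current EDU string, and
            # records a (tree_index, token_index, edu_index) triple whenever
            # a token begins a new EDU.  The long chain of per-file
            # replacements patches known mismatches between the RSTDTB EDU
            # strings and the PTB parses so that this character-level
            # matching can succeed.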
            edu_index = -1
            tok_index = 0
            tree_index = 0

            edu = ""
            tree = trees[0]
            tokens_doc = [extract_converted_terminals(t) for t in trees]
            tokens = tokens_doc[0]
            preterminals = [extract_preterminals(t) for t in trees]

            while edu_index < len(edus) - 1:
                # If we are out of tokens for the sentence we are working
                # with, move to the next sentence.
                if tok_index >= len(tokens):
                    tree_index += 1
                    if tree_index >= len(trees):
                        logging.warning('Not enough syntax trees for {}, '
                                        'probably because the RSTDTB '
                                        'contains a footer that is not in '
                                        'the PTB. The remaining EDUs will '
                                        'be automatically tagged.'
                                        .format(path_basename))
                        unparsed_edus = ' '.join(edus[edu_index + 1:])
                        # The tokenizer splits '---' into '--' '-'.
                        # This is a hack to get around that.
                        unparsed_edus = re.sub(r'---', '--', unparsed_edus)
                        for tagged_sent in \
                                [nltk.pos_tag(
                                    convert_paren_tokens_to_ptb_format(
                                        TreebankWordTokenizer().tokenize(x)))
                                 for x in nltk.sent_tokenize(unparsed_edus)]:
                            new_tree = ParentedTree.fromstring(
                                '((S {}))'.format(
                                    ' '.join(['({} {})'.format(tag, word)
                                              for word, tag in tagged_sent])))
                            trees.append(new_tree)
                            tokens_doc.append(
                                extract_converted_terminals(new_tree))
                            preterminals.append(
                                extract_preterminals(new_tree))
                    tree = trees[tree_index]
                    tokens = tokens_doc[tree_index]
                    tok_index = 0
                tok = tokens[tok_index]

                # If edu is the empty string, then the previous edu was
                # completed by the last token, so this token starts the
                # next edu.
                if not edu:
                    edu_index += 1
                    edu = edus[edu_index]
                    edu = re.sub(r'>\s*', r'', edu).replace('&amp;', '&')
                    edu = re.sub(r'---', r'--', edu)
                    edu = edu.replace('. . .', '...')

                    # Annoying edge cases that have to be patched by hand.
                    if path_basename == 'file1.edus':
                        edu = edu.replace('founded by',
                                          'founded by his grandfather.')
                    elif (path_basename == 'wsj_0660.out.edus'
                          or path_basename == 'wsj_1368.out.edus'
                          or path_basename == 'wsj_1371.out.edus'):
                        edu = edu.replace('S.p. A.', 'S.p.A.')
                    elif path_basename == 'wsj_1329.out.edus':
                        edu = edu.replace('G.m.b. H.', 'G.m.b.H.')
                    elif path_basename == 'wsj_1367.out.edus':
                        edu = edu.replace('-- that turban --',
                                          '-- that turban')
                    elif path_basename == 'wsj_1377.out.edus':
                        edu = edu.replace('Part of a Series',
                                          'Part of a Series }')
                    elif path_basename == 'wsj_1974.out.edus':
                        edu = edu.replace(r'5/ 16', r'5/16')
                    elif path_basename == 'file2.edus':
                        edu = edu.replace('read it into the record,',
                                          'read it into the record.')
                    elif path_basename == 'file3.edus':
                        edu = edu.replace('about $to $', 'about $2 to $4')
                    elif path_basename == 'file5.edus':
                        # There is a PTB error in wsj_2172.mrg:
                        # the word "analysts" is missing from the parse.
                        # It's gone without a trace :-/
                        edu = edu.replace('panic among analysts',
                                          'panic among')
                        edu = edu.replace('his bid Oct. 17', 'his bid Oct. 5')
                        edu = edu.replace('his bid on Oct. 17',
                                          'his bid on Oct. 5')
                        edu = edu.replace('to commit $billion,',
                                          'to commit $3 billion,')
                        edu = edu.replace('received $million in fees',
                                          'received $8 million in fees')
                        edu = edu.replace('`` in light', '"in light')
                        edu = edu.replace('3.00 a share', '2 a share')
                        edu = edu.replace(" the Deal.", " the Deal.'")
                        edu = edu.replace("' Why doesn't", "Why doesn't")
                    elif path_basename == 'wsj_1331.out.edus':
                        edu = edu.replace('`S', "'S")
                    elif path_basename == 'wsj_1373.out.edus':
                        edu = edu.replace('... An N.V.', 'An N.V.')
                        edu = edu.replace('features.', 'features....')
                    elif path_basename == 'wsj_1123.out.edus':
                        edu = edu.replace('" Reuben', 'Reuben')
                        edu = edu.replace('subscribe to.', 'subscribe to."')
                    elif path_basename == 'wsj_2317.out.edus':
                        edu = edu.replace('. The lower', 'The lower')
                        edu = edu.replace('$4 million', '$4 million.')
                    elif path_basename == 'wsj_1376.out.edus':
                        edu = edu.replace('Elizabeth.', 'Elizabeth.\'"')
                        edu = edu.replace('\'" In', 'In')
                    elif path_basename == 'wsj_1105.out.edus':
                        # PTB error: a sentence starts with an end quote.
                        # For simplicity, we'll just make the EDU string
                        # look like the PTB sentence.
                        edu = edu.replace('By lowering prices',
                                          '"By lowering prices')
                        edu = edu.replace(' 70% off."', ' 70% off.')
                    elif path_basename == 'wsj_1125.out.edus':
                        # PTB error: a sentence ends with a start quote.
                        edu = edu.replace('developer.', 'developer."')
                        edu = edu.replace('"So developers', 'So developers')
                    elif path_basename == 'wsj_1158.out.edus':
                        edu = re.sub(r'\s*\-$', r'', edu)
                        # PTB error: a sentence starts with an end quote.
                        edu = edu.replace(' virtues."', ' virtues.')
                        edu = edu.replace('So much for', '"So much for')
                    elif path_basename == 'wsj_0632.out.edus':
                        # PTB error: a sentence starts with an end quote.
                        edu = edu.replace(' individual.', ' individual."')
                        edu = edu.replace('"If there ', 'If there ')
                    elif path_basename == 'wsj_2386.out.edus':
                        # PTB error: a sentence starts with an end quote.
                        edu = edu.replace('lenders."', 'lenders.')
                        edu = edu.replace('Mr. P', '"Mr. P')
                    elif path_basename == 'wsj_1128.out.edus':
                        # PTB error: a sentence ends with a start quote.
                        edu = edu.replace('it down.', 'it down."')
                        edu = edu.replace('"It\'s a real"', "It's a real")
                    elif path_basename == 'wsj_1323.out.edus':
                        # PTB error (or at least a very unusual edge case):
                        # "--" ends a sentence.
                        edu = edu.replace('-- damn!', 'damn!')
                        edu = edu.replace('from the hook',
                                          'from the hook --')
                    elif path_basename == 'wsj_2303.out.edus':
                        # PTB error: a sentence ends with a start quote.
                        edu = edu.replace('Simpson in an interview.',
                                          'Simpson in an interview."')
                        edu = edu.replace('"Hooker\'s', 'Hooker\'s')

                    # wsj_2343.out.edus also has an error that can't be
                    # easily fixed: an EDU spans two sentences
                    # ("to analyze what...").

                    if edu_start_indices \
                            and tree_index - edu_start_indices[-1][0] > 1:
                        logging.warning('SKIPPED A TREE. file = {}, '
                                        'tree_index = {}, '
                                        'edu_start_indices[-1][0] = {}, '
                                        'edu index = {}'
                                        .format(path_basename, tree_index,
                                                edu_start_indices[-1][0],
                                                edu_index))

                    edu_start_indices.append((tree_index, tok_index,
                                              edu_index))
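                # The token is matched against the front of the EDU in one
                # of three ways below: an exact prefix match, a match after
                # skipping a single leading non-alphanumeric character, or a
                # "loose match" in which both the token and the start of the
                # EDU are punctuation; anything else raises an exception.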
                # Remove the next token from the edu, along with any
                # whitespace.
                if edu.startswith(tok):
                    edu = edu[len(tok):].strip()
                elif (re.search(r'[^a-zA-Z0-9]', edu[0])
                      and edu[1:].startswith(tok)):
                    logging.warning('loose match: tok = {}, remainder '
                                    'of EDU: {}'.format(tok, edu))
                    edu = edu[len(tok) + 1:].strip()
                else:
                    m_tok = re.search(r'^[^a-zA-Z ]+$', tok)
                    m_edu = re.search(r'^[^a-zA-Z ]+(.*)', edu)
                    if not m_tok or not m_edu:
                        raise Exception('\n\npath_index: {}\ntok: {}\n'
                                        'edu: {}\nfull_edu: {}\n'
                                        'leaves: {}\n\n'
                                        .format(path_index, tok, edu,
                                                edus[edu_index],
                                                tree.leaves()))
                    logging.warning('loose match: {} ==> {}'
                                    .format(tok, edu))
                    edu = m_edu.groups()[0].strip()

                tok_index += 1
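            # Assemble the document dictionary consumed by downstream
            # components (e.g., the segmenter and RST parser).  All values
            # are JSON-serializable: trees are stored as strings and tree
            # positions as sequences of child indices.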
file = {}" + " tree_index = {}," + " edu_start_indices[-1][0] = {}," + " edu index = {}") .format(path_basename, tree_index, edu_start_indices[-1][0], edu_index)) edu_start_indices.append((tree_index, tok_index, edu_index)) # remove the next token from the edu, along with any whitespace if edu.startswith(tok): edu = edu[len(tok):].strip() elif (re.search(r'[^a-zA-Z0-9]', edu[0]) and edu[1:].startswith(tok)): logging.warning(("loose match: tok = {}, " + "remainder of EDU: {}").format(tok, edu)) edu = edu[len(tok) + 1:].strip() else: m_tok = re.search(r'^[^a-zA-Z ]+$', tok) m_edu = re.search(r'^[^a-zA-Z ]+(.*)', edu) if not m_tok or not m_edu: raise Exception(('\n\npath_index: {}\ntok: {}\n' + 'edu: {}\nfull_edu:{}\nleaves:' + '{}\n\n').format(path_index, tok, edu, edus[edu_index], tree.leaves())) logging.warning("loose match: {} ==> {}".format(tok, edu)) edu = m_edu.groups()[0].strip() tok_index += 1 output = {"doc_id": ptb_id, "path_basename": path_basename, "tokens": tokens_doc, "edu_strings": edus, "syntax_trees": [t.pprint(margin=TREE_PRINT_MARGIN) for t in trees], "token_tree_positions": [[x.treeposition() for x in preterminals_sentence] for preterminals_sentence in preterminals], "pos_tags": [[x.label() for x in preterminals_sentence] for preterminals_sentence in preterminals], "edu_start_indices": edu_start_indices, "rst_tree": rst_tree.pprint(margin=TREE_PRINT_MARGIN), "edu_starts_paragraph": edu_starts_paragraph} assert len(edu_start_indices) == len(edus) assert len(edu_starts_paragraph) == len(edus) # check that the EDUs match up edu_tokens = extract_edus_tokens(edu_start_indices, tokens_doc) for edu_index, (edu, edu_token_list) \ in enumerate(zip(edus, edu_tokens)): edu_nospace = re.sub(r'\s+', '', edu).lower() edu_tokens_nospace = ''.join(edu_token_list).lower() distance = nltk.metrics.distance.edit_distance( edu_nospace, edu_tokens_nospace) if distance > 4: logging.warning(("EDIT DISTANCE > 3 IN {}: " + "edu string = {}, edu tokens = {}, " + "edu idx = {}") .format(path_basename, edu, edu_token_list, edu_index)) if not re.search(r'[A-Za-z0-9]', edu_tokens_nospace): logging.warning(("PUNCTUATION-ONLY EDU IN {}: " + "edu tokens = {}, edu idx = {}") .format(path_basename, edu_token_list, edu_index)) outputs.append(output) with open(os.path.join(args.output_dir, ('rst_discourse_tb_edus_' + '{}.json').format(dataset)), 'w') as outfile: json.dump(outputs, outfile)