def parse_filtered_sentences(source_dir, marker_set_tag):
    """
    This function can be the same for each corpus.

    :param source_dir: root directory of the corpus
    :param marker_set_tag: tag identifying the discourse marker set
    :return:
    """
    markers_dir = pjoin(source_dir, "markers_" + marker_set_tag)
    input_dir = pjoin(markers_dir, "sentences")
    input_file_path = pjoin(input_dir, "{}.tsv".format(marker_set_tag))
    output_dir = pjoin(markers_dir, "parsed_sentence_pairs")

    if not os.path.exists(markers_dir):
        raise Exception("{} does not exist".format(markers_dir))
    if not os.path.exists(input_dir):
        raise Exception("{} does not exist".format(input_dir))
    if not os.path.exists(input_file_path):
        raise Exception("{} does not exist".format(input_file_path))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    logger.info("setting up parser (actually just testing atm)")
    setup_corenlp()

    with open(pjoin(output_dir,
                    "{}_parsed_sentence_pairs.txt".format(marker_set_tag)),
              'a') as w:
        with open(input_file_path, 'rb') as f:
            logger.info("reading {}".format(input_file_path))
            i = 0
            for line in f:
                # each input line is "sentence \t previous_sentence \t marker"
                sentence, previous, marker = line[:-1].split("\t")
                i += 1
                parsed_output = dependency_parsing(sentence, previous, marker)
                if parsed_output:
                    s1, s2 = parsed_output
                    line_to_print = "{}\t{}\t{}\n".format(s1, s2, marker)
                    w.write(line_to_print)
                if i % args.filter_print_every == 0:
                    logger.info("processed {}".format(i))

    logger.info('file writing complete')
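# Illustration only: each line of the input TSV read above is assumed to hold a
# "sentence \t previous_sentence \t marker" triple, and the output file mirrors
# it as "s1 \t s2 \t marker".  The example strings below are hypothetical.
def _example_tsv_round_trip():
    raw_line = "Me quede en casa porque llovia.\tFue un mal dia.\tporque\n"
    sentence, previous, marker = raw_line[:-1].split("\t")
    s1, s2 = "Me quede en casa", "llovia."  # what a successful parse might yield
    return "{}\t{}\t{}\n".format(s1, s2, marker)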
def parse_filtered_sentences(source_dir, filenames, marker_set_tag, discourse_markers):
    """
    This function can be the same for each corpus.

    :param source_dir: root directory of the corpus
    :param filenames: corpus files to read
    :param marker_set_tag: tag identifying the discourse marker set
    :param discourse_markers: discourse markers to keep
    :return:
    """
    markers_dir = pjoin(source_dir, "markers_" + marker_set_tag)
    input_dir = pjoin(markers_dir, "sentences")
    input_file_path = pjoin(input_dir, "{}.json".format(marker_set_tag))
    output_dir = pjoin(markers_dir, "parsed_sentence_pairs")

    if not os.path.exists(markers_dir):
        raise Exception("{} does not exist".format(markers_dir))
    if not os.path.exists(input_dir):
        raise Exception("{} does not exist".format(input_dir))
    if not os.path.exists(input_file_path):
        raise Exception("{} does not exist".format(input_file_path))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    logger.info("setting up parser (actually just testing atm)")
    setup_corenlp()

    with open(pjoin(output_dir,
                    "{}_parsed_sentence_pairs.txt".format(marker_set_tag)),
              'a') as w:
        with open(input_file_path, 'rb') as f:
            logger.info("reading {}".format(input_file_path))
            sentences = json.load(f)
            logger.info("total sentences: {}".format(
                sum([len(sentences[marker]["sentence"]) for marker in sentences])))

            for marker, slists in sentences.iteritems():
                i = 0
                if marker in discourse_markers:
                    # deduplicate (sentence, previous) pairs before parsing
                    for sentence, previous in set(
                            zip(slists["sentence"], slists["previous"])):
                        i += 1
                        parsed_output = dependency_parsing(sentence, previous, marker)
                        if parsed_output:
                            s1, s2 = parsed_output
                            line_to_print = "{}\t{}\t{}\n".format(s1, s2, marker)
                            w.write(line_to_print)
                        if i % args.filter_print_every == 0:
                            logger.info("processed {}".format(i))

    logger.info('file writing complete')
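# For reference, the JSON file loaded above is assumed to map each discourse
# marker to parallel "sentence" / "previous" lists.  A minimal sketch of
# producing such a file with hypothetical data:
def _example_sentence_json(path):
    import json
    data = {
        "because": {
            "sentence": ["I stayed home because it rained."],
            "previous": ["It was a bad day."],
        }
    }
    with open(path, "w") as f:
        json.dump(data, f)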
    for item in test_items:
        output = depparse_ssplit(item["sentence"], item["previous_sentence"],
                                 item["marker"], lang)
        if output:
            output = "[\"" + "\", \"".join(list(output)) + "\"]"
        try:
            assert output == item["output"]
        except AssertionError:
            print("====== TEST FAILED ======"
                  + "\nsentence: " + item["sentence"]
                  + "\nmarker: " + item["marker"]
                  + "\nactual output: " + str(output)
                  + "\ndesired output: " + str(item["output"]))
            failures += 1

    if failures == 0:
        print("All tests passed.")


if __name__ == '__main__':
    args = setup_args()
    setup_corenlp(args.lang)
    test(args.lang)
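# For reference, each test item consumed by the loop above is assumed to be a
# dict like the following; the values, and the exact spacing of the expected
# "output" string, are hypothetical.
EXAMPLE_TEST_ITEM = {
    "sentence": "I stayed home because it rained.",
    "previous_sentence": "It was a bad day.",
    "marker": "because",
    "output": "[\"I stayed home\", \"it rained.\"]",
}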
def dependency_parsing(sentence, previous_sentence, marker):
    return depparse_ssplit(sentence, previous_sentence, marker, lang='sp')


if __name__ == '__main__':
    if args.extract:
        extrat_raw_gigaword()
    elif args.filter:
        collect_raw_sentences(gigaword_sp_dir, [gigaword_sp_file], "ALL",
                              SP_DISCOURSE_MARKERS)
    elif args.parse:
        setup_corenlp("sp")
        parse_filtered_sentences(gigaword_sp_dir, "ALL")
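# The module-level `args` used throughout this script is assumed to come from
# an argparse setup along these lines; the flag names are inferred from the
# attributes the code reads and may differ from the real definitions.
def _example_setup_args():
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument("--extract", action="store_true")
    p.add_argument("--filter", action="store_true")
    p.add_argument("--parse", action="store_true")
    p.add_argument("--filter_print_every", type=int, default=10000)
    return p.parse_args()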
def parse_filtered_sentences(source_dir, marker_set_tag):
    """
    This function can be the same for each corpus.

    :param source_dir: root directory of the corpus
    :param marker_set_tag: tag identifying the discourse marker set
    :return:
    """
    markers_dir = pjoin(source_dir, "markers_" + marker_set_tag)
    input_dir = pjoin(markers_dir, "sentences")
    input_file_path = pjoin(input_dir, "{}.json".format(marker_set_tag))
    output_dir = pjoin(markers_dir, "parsed_sentence_pairs")

    if not os.path.exists(markers_dir):
        raise Exception("{} does not exist".format(markers_dir))
    if not os.path.exists(input_dir):
        raise Exception("{} does not exist".format(input_dir))
    if not os.path.exists(input_file_path):
        raise Exception("{} does not exist".format(input_file_path))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    logger.info("setting up parser (actually just testing atm)")
    setup_corenlp()

    with open(pjoin(output_dir,
                    "{}_parsed_sentence_pairs.txt".format(marker_set_tag)),
              'a') as w:
        with open(input_file_path, 'rb') as f:
            logger.info("reading {}".format(input_file_path))
            sentences = json.load(f)

            # resume parsing on "而" only: markers in the exclusion list have
            # already been parsed, so drop them from the sentences dictionary
            if args.exclude_list:
                exclusion_list = [u'虽然', u'可是', u'不过', u'所以', u'但', u'因此']
                logger.info("excluded: {}".format(exclusion_list))
                for ex_marker in exclusion_list:
                    del sentences[ex_marker]

            logger.info("total sentences: {}".format(
                sum([len(sentences[marker]["sentence"]) for marker in sentences])))

            i = 0
            for marker, slists in sentences.iteritems():
                for sentence, previous in zip(slists["sentence"], slists["previous"]):
                    i += 1
                    try:
                        parsed_output = dependency_parsing(sentence, previous, marker)
                        if parsed_output:
                            s1, s2 = parsed_output
                            line_to_print = "{}\t{}\t{}\n".format(s1, s2, marker)
                            w.write(line_to_print)
                    except Exception:
                        # log the example that failed to parse and keep going
                        print i, marker, sentence
                    if i % args.filter_print_every == 0:
                        logger.info("processed {}".format(i))

    logger.info('file writing complete')
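# A hedged alternative to the hard-coded exclusion list above: since the output
# file is opened in append mode, a restart could instead derive the set of
# markers that already appear in it and skip exactly what has been finished.
# Hypothetical helper, not part of the repo:
def _already_parsed_markers(output_path):
    done = set()
    if os.path.exists(output_path):
        with open(output_path, 'rb') as f:
            for line in f:
                parts = line[:-1].split("\t")
                if len(parts) == 3:
                    done.add(parts[2])
    return done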
def dependency_parsing(sentence, previous_sentence, marker):
    return depparse_ssplit(sentence, previous_sentence, marker, lang='ch')


if __name__ == '__main__':
    if args.extract:
        extrat_raw_gigaword()
    elif args.filter:
        collect_raw_sentences(gigaword_cn_dir, [gigaword_cn_file], "ALL14",
                              CH_DISCOURSE_MARKERS)
    elif args.parse:
        setup_corenlp("ch")
        parse_filtered_sentences(gigaword_cn_dir, "ALL14")
import uuid
import pickle

import nltk
import spacy

from seq2seq import PhenomenonEncoder, L2EDecoder
from parser import setup_corenlp, get_parse, Sentence
from onmt.model_builder import build_base_model
from onmt import inputters
from pattern.en import conjugate, lemma, lexeme, PRESENT, PAST, PL, SG

setup_corenlp("en")
nlp = spacy.load("en_core_web_sm")


def replacements(parse, subs):
    old_string = subs["old"]
    new_string = subs["new"]
    pos_type = subs["pos"]

    # 1-based indices of tokens whose surface form matches `old` and whose POS
    # tag starts with the requested coarse POS letter (e.g. "V" for verbs)
    indices_to_replace = [
        t["index"] for t in parse.new_tokens
        if t["word"] == old_string and t["pos"][0] == pos_type
    ]

    for i in indices_to_replace:
        # swap in the replacement word and normalize the whitespace that
        # follows the preceding token
        parse.new_tokens[i - 1]["word"] = new_string
        parse.new_tokens[i - 2]["after"] = " "

    # reconstruct the surface string from the (possibly modified) tokens;
    # this join is an assumed completion of the truncated original
    sentence = "".join(t["word"] + t["after"] for t in parse.new_tokens)
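# Hypothetical usage sketch of replacements(): the `parse` argument only needs
# a `new_tokens` list of CoreNLP-style token dicts, so a tiny stand-in object
# is enough to illustrate the substitution.  Names below are examples only.
class _FakeParse(object):
    def __init__(self, tokens):
        self.new_tokens = tokens


_example_parse = _FakeParse([
    {"index": 1, "word": "She", "after": " ", "pos": "PRP"},
    {"index": 2, "word": "ran", "after": " ", "pos": "VBD"},
    {"index": 3, "word": "home", "after": "", "pos": "NN"},
])
# replace the past-tense verb "ran" with "walked"
replacements(_example_parse, {"old": "ran", "new": "walked", "pos": "V"})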