def parse_filtered_sentences(source_dir, marker_set_tag):
    """
    This function can be the same for each corpus

    :param source_dir:
    :param marker_set_tag:
    :return:
    """

    markers_dir = pjoin(source_dir, "markers_" + marker_set_tag)
    input_dir = pjoin(markers_dir, "sentences")
    input_file_path = pjoin(input_dir, "{}.tsv".format(marker_set_tag))
    output_dir = pjoin(markers_dir, "parsed_sentence_pairs")

    if not os.path.exists(markers_dir):
        raise Exception("{} does not exist".format(markers_dir))
    if not os.path.exists(input_dir):
        raise Exception("{} does not exist".format(input_dir))
    if not os.path.exists(input_file_path):
        raise Exception("{} does not exist".format(input_file_path))

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    logger.info("setting up parser (actually just testing atm)")
    setup_corenlp()

    # output is opened in append mode so an interrupted run can be resumed
    with open(
            pjoin(output_dir,
                  "{}_parsed_sentence_pairs.txt".format(marker_set_tag)),
            'a') as w:

        with open(input_file_path, 'rb') as f:
            logger.info("reading {}".format(input_file_path))
            i = 0
            for line in f:
                # each TSV line is: sentence \t previous sentence \t marker
                sentence, previous, marker = line[:-1].split("\t")
                i += 1
                parsed_output = dependency_parsing(sentence, previous, marker)
                if parsed_output:
                    s1, s2 = parsed_output
                    line_to_print = "{}\t{}\t{}\n".format(s1, s2, marker)
                    w.write(line_to_print)
                if i % args.filter_print_every == 0:
                    logger.info("processed {}".format(i))

    logger.info('file writing complete')
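For context, a minimal sketch of the input this variant expects: a tab-separated file produced by an earlier filtering step (see the args.filter branches further down). The corpus root "data/example_corpus" and the sentence pair are made up for illustration.

# Sketch: directory layout and a toy TSV for the function above
# (hypothetical corpus root; example sentences invented).
import os
from os.path import join as pjoin

source_dir = "data/example_corpus"
marker_set_tag = "ALL"

input_dir = pjoin(source_dir, "markers_" + marker_set_tag, "sentences")
if not os.path.exists(input_dir):
    os.makedirs(input_dir)

with open(pjoin(input_dir, "{}.tsv".format(marker_set_tag)), "w") as f:
    # one example per line: sentence \t previous sentence \t marker
    f.write("she stayed home\tit was raining\tbecause\n")

# parse_filtered_sentences(source_dir, marker_set_tag) would then append
# "s1 \t s2 \t marker" lines under markers_ALL/parsed_sentence_pairs/.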
Example #2
def parse_filtered_sentences(source_dir, filenames, marker_set_tag,
                             discourse_markers):
    """
    This function can be the same for each corpus

    :param source_dir:
    :param filenames:
    :param marker_set_tag:
    :param discourse_markers:
    :return:
    """

    markers_dir = pjoin(source_dir, "markers_" + marker_set_tag)
    input_dir = pjoin(markers_dir, "sentences")
    input_file_path = pjoin(input_dir, "{}.json".format(marker_set_tag))
    output_dir = pjoin(markers_dir, "parsed_sentence_pairs")

    if not os.path.exists(markers_dir):
        raise Exception("{} does not exist".format(markers_dir))
    if not os.path.exists(input_dir):
        raise Exception("{} does not exist".format(input_dir))
    if not os.path.exists(input_file_path):
        raise Exception("{} does not exist".format(input_file_path))

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    logger.info("setting up parser (actually just testing atm)")
    setup_corenlp()

    # output is opened in append mode so an interrupted run can be resumed
    with open(
            pjoin(output_dir,
                  "{}_parsed_sentence_pairs.txt".format(marker_set_tag)),
            'a') as w:

        with open(input_file_path, 'rb') as f:
            logger.info("reading {}".format(input_file_path))
            sentences = json.load(f)
            logger.info("total sentences: {}".format(
                sum([
                    len(sentences[marker]["sentence"]) for marker in sentences
                ])))
            for marker, slists in sentences.iteritems():
                i = 0
                if marker in discourse_markers:
                    # deduplicate (sentence, previous) pairs before parsing
                    for sentence, previous in set(
                            zip(slists["sentence"], slists["previous"])):
                        i += 1
                        parsed_output = dependency_parsing(
                            sentence, previous, marker)
                        if parsed_output:
                            s1, s2 = parsed_output

                            line_to_print = "{}\t{}\t{}\n".format(
                                s1, s2, marker)
                            w.write(line_to_print)

                        if i % args.filter_print_every == 0:
                            logger.info("processed {}".format(i))

    logger.info('file writing complete')
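The JSON file this variant reads is keyed by discourse marker, with parallel "sentence" and "previous" lists (the field names come from the accesses above). A minimal sketch, with invented sentences and a hypothetical file name following the {marker_set_tag}.json pattern:

# Sketch: shape of the input JSON consumed by the function above.
import json

sentences = {
    "because": {
        "sentence": ["she stayed home because it was raining ."],
        "previous": ["the forecast had been wrong ."],
    },
    "but": {
        "sentence": ["he tried again but the server was down ."],
        "previous": ["the first request had timed out ."],
    },
}

with open("ALL.json", "w") as f:
    json.dump(sentences, f)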
Example #3
    for item in test_items:
        output = depparse_ssplit(item["sentence"], item["previous_sentence"],
                                 item["marker"], lang)
        if output:
            # serialize the (s1, s2) pair the same way the expected
            # output is stored
            output = "[\"" + "\", \"".join(list(output)) + "\"]"
        try:
            assert (output == item["output"])
        except AssertionError:
            print("====== TEST FAILED ======" + "\nsentence: " +
                  item["sentence"] + "\nmarker: " + item["marker"] +
                  "\nactual output: " + str(output) + "\ndesired output: " +
                  str(item["output"]))
            failures += 1

    if failures == 0:
        print("All tests passed.")


if __name__ == '__main__':
    args = setup_args()
    setup_corenlp(args.lang)
    test(args.lang)
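Each entry in test_items pairs an input sentence with the expected split; the field names below are the ones read in the loop above, and the values are invented for illustration (the "output" string mirrors the '["s1", "s2"]' serialization built before the comparison).

# Sketch: one hypothetical test_items entry.
test_item = {
    "sentence": "she stayed home because it was raining .",
    "previous_sentence": "the forecast had been wrong .",
    "marker": "because",
    "output": "[\"she stayed home\", \"it was raining .\"]",
}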
Example #4
                                                       marker)
                    if parsed_output:
                        s1, s2 = parsed_output
                        line_to_print = "{}\t{}\t{}\n".format(s1, s2, marker)
                        w.write(line_to_print)
                if i % args.filter_print_every == 0:
                    logger.info("processed {}".format(i))

    logger.info('file writing complete')


def dependency_parsing(sentence, previous_sentence, marker):
    return depparse_ssplit(sentence, previous_sentence, marker, lang='sp')


if __name__ == '__main__':
    if args.extract:
        extrat_raw_gigaword()
    elif args.filter:
        collect_raw_sentences(gigaword_sp_dir, [gigaword_sp_file], "ALL",
                              SP_DISCOURSE_MARKERS)
    elif args.parse:
        setup_corenlp("sp")
        parse_filtered_sentences(gigaword_sp_dir, "ALL")
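The __main__ blocks in these scripts dispatch on module-level args flags. Below is a minimal argparse setup consistent with the flags used across the examples (extract, filter, parse, exclude_list, filter_print_every, lang); the real setup_args() in the repository may differ, and the defaults shown here are assumptions.

# Sketch: command-line flags assumed by the __main__ blocks in these scripts
# (defaults are placeholders, not the repository's actual values).
import argparse

def setup_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--extract", action="store_true")
    parser.add_argument("--filter", action="store_true")
    parser.add_argument("--parse", action="store_true")
    parser.add_argument("--exclude_list", action="store_true")
    parser.add_argument("--filter_print_every", type=int, default=10000)
    parser.add_argument("--lang", type=str, default="en")
    return parser.parse_args()

args = setup_args()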
Example #5
def parse_filtered_sentences(source_dir, marker_set_tag):
    """
    This function can be the same for each corpus

    :param source_dir:
    :param marker_set_tag:
    :return:
    """

    markers_dir = pjoin(source_dir, "markers_" + marker_set_tag)
    input_dir = pjoin(markers_dir, "sentences")
    input_file_path = pjoin(input_dir, "{}.json".format(marker_set_tag))
    output_dir = pjoin(markers_dir, "parsed_sentence_pairs")

    if not os.path.exists(markers_dir):
        raise Exception("{} does not exist".format(markers_dir))
    if not os.path.exists(input_dir):
        raise Exception("{} does not exist".format(input_dir))
    if not os.path.exists(input_file_path):
        raise Exception("{} does not exist".format(input_file_path))

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    logger.info("setting up parser (actually just testing atm)")
    setup_corenlp()

    # output is opened in append mode so an interrupted run can be resumed
    with open(
            pjoin(output_dir,
                  "{}_parsed_sentence_pairs.txt".format(marker_set_tag)),
            'a') as w:

        with open(input_file_path, 'rb') as f:
            logger.info("reading {}".format(input_file_path))
            sentences = json.load(f)

            # resume support: skip markers that have already been parsed
            # (here, everything except 而)
            if args.exclude_list:
                exclusion_list = [u'虽然', u'可是', u'不过', u'所以', u'但', u'因此']
                logger.info("excluded: {}".format(exclusion_list))

                # drop the finished markers from the sentences dictionary
                for ex_marker in exclusion_list:
                    del sentences[ex_marker]

            logger.info("total sentences: {}".format(
                sum([
                    len(sentences[marker]["sentence"]) for marker in sentences
                ])))
            i = 0
            for marker, slists in sentences.iteritems():
                for sentence, previous in zip(slists["sentence"],
                                              slists["previous"]):
                    i += 1
                    # TODO: make the starting index a command-line argument
                    # so a run can resume mid-file
                    try:
                        parsed_output = dependency_parsing(
                            sentence, previous, marker)
                        if parsed_output:
                            s1, s2 = parsed_output
                            line_to_print = "{}\t{}\t{}\n".format(
                                s1, s2, marker)
                            w.write(line_to_print)
                    except Exception:
                        # log the failing example and keep going
                        print i, marker, sentence

                    if i % args.filter_print_every == 0:
                        logger.info("processed {}".format(i))

    logger.info('file writing complete')
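The output written by these scripts is a flat TSV of sentence pairs. A small sketch of reading it back, matching the "{}\t{}\t{}" lines written above; the file name assumes the "ALL14" tag used for the Chinese corpus below.

# Sketch: load parsed sentence pairs back into (s1, s2, marker) triples.
pairs = []
with open("ALL14_parsed_sentence_pairs.txt") as f:
    for line in f:
        s1, s2, marker = line.rstrip("\n").split("\t")
        pairs.append((s1, s2, marker))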
Example #6
                        try:
                            parsed_output = dependency_parsing(
                                sentence, previous, marker)
                            if parsed_output:
                                s1, s2 = parsed_output
                                line_to_print = "{}\t{}\t{}\n".format(
                                    s1, s2, marker)
                                w.write(line_to_print)
                        except Exception:
                            # log the failing example and keep going
                            print i, marker, sentence

                        if i % args.filter_print_every == 0:
                            logger.info("processed {}".format(i))

    logger.info('file writing complete')


def dependency_parsing(sentence, previous_sentence, marker):
    return depparse_ssplit(sentence, previous_sentence, marker, lang='ch')


if __name__ == '__main__':
    if args.extract:
        extrat_raw_gigaword()
    elif args.filter:
        collect_raw_sentences(gigaword_cn_dir, [gigaword_cn_file], "ALL14",
                              CH_DISCOURSE_MARKERS)
    elif args.parse:
        setup_corenlp("ch")
        parse_filtered_sentences(gigaword_cn_dir, "ALL14")
Example #7
import nltk

from seq2seq import PhenomenonEncoder, L2EDecoder
from parser import setup_corenlp, get_parse, Sentence
import spacy

import uuid
import pickle

from onmt.model_builder import build_base_model
from onmt import inputters

from pattern.en import conjugate, lemma, lexeme, PRESENT, PAST, PL, SG

setup_corenlp("en")
nlp = spacy.load("en_core_web_sm")
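The replacements helper below edits parse.new_tokens in place; each token is a dict carrying (at least) the CoreNLP-style fields the function reads: "index" (1-based), "word", "pos", and "after" (the whitespace following the token). A sketch with invented values:

# Sketch: the token fields accessed by replacements() below
# (values invented; real tokens come from the CoreNLP parse).
example_tokens = [
    {"index": 1, "word": "He", "pos": "PRP", "after": " "},
    {"index": 2, "word": "ran", "pos": "VBD", "after": " "},
    {"index": 3, "word": "home", "pos": "NN", "after": ""},
]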


def replacements(parse, subs):
    # substitute subs["new"] for tokens matching subs["old"] with the
    # requested coarse POS class, then rebuild the sentence string
    old_string = subs["old"]
    new_string = subs["new"]
    pos_type = subs["pos"]
    indices_to_replace = [
        t["index"] for t in parse.new_tokens
        if t["word"] == old_string and t["pos"][0] == pos_type
    ]
    for i in indices_to_replace:
        parse.new_tokens[i - 1]["word"] = new_string
        parse.new_tokens[i - 2]["after"] = " "
    sentence = "".join(t["word"] + t["after"]