예제 #1
0
def test_date_check():
    """Check that the code for "Carl XVI Gustaf" depends on the sentence date.

    The same parse is coded at three dates; the noun phrase at
    tree.children[0] must yield the code listed next to each date.
    """
    parse = "(S (NP (NNP CARL ) (NN XVI ) (NNP GUSTAF ) ) )"

    expected_by_date = (
        ("20150813", ["SWEGOV"]),
        ("19720813", ["SWEELI"]),
        ("19010813", ["SWEELI"]),
    )

    for datestr, expected in expected_by_date:
        sentence = ptree.Sentence(parse, "Carl XVI Gustaf",
                                  PETRreader.dstr_to_ordate(datestr))
        noun_phrase = sentence.tree.children[0]
        assert noun_phrase.get_meaning() == expected
예제 #2
0
    def check_date(self, match):
        """
        Pick the actor code whose date restriction covers self.date.

        Parameters
        -----------
        match: list
               Candidate entries; entry[0] is the code and entry[1] is its
               list of date strings. An empty list means the code always
               applies. A single string starting with '<' means "strictly
               before that date"; any other single string has its first
               character dropped and is used as an inclusive lower bound.
               Two strings form a range: first <= self.date < second.

        Returns
        -------
        code: string
              The first truthy code whose restriction holds. None when match
              is empty; otherwise a falsy value (normally "") when no entry
              applies. Any exception raised while scanning is swallowed and
              whatever value `code` holds at that moment is returned.
        """

        code = None
        try:
            for entry in match:
                raw_dates = entry[1]
                code = ""
                # Convert every date string to an ordinal, keeping a leading
                # '<' or '>' marker in front of the converted number.
                bounds = [
                    d[0] + str(PETRreader.dstr_to_ordate(d[1:]))
                    if d[0] in "<>"
                    else str(PETRreader.dstr_to_ordate(d))
                    for d in raw_dates
                ]
                today = self.date

                if not bounds:
                    applies = True
                elif len(bounds) == 1:
                    only = bounds[0]
                    if only[0] == "<":
                        applies = today < int(only[1:])
                    else:
                        applies = today >= int(only[1:])
                else:
                    # Upper bound is tested first, as in the original.
                    applies = today < int(bounds[1]) and today >= int(bounds[0])

                if applies:
                    code = entry[0]
                if code:
                    return code
        except Exception:
            # Best-effort: fall back to whatever was resolved so far.
            return code

        return code
def test_reflexive():
    """Check that the reflexive noun phrase ("HIMSELF") is coded as USAGOV.

    The phrase under test is tree.children[1].children[1] of the sentence
    built from the upper-cased parse below.
    """
    raw_parse = "(S (NP (NNP Obama ) )  (VP (VBD asked ) (NP (PRP himself ) )  (SBAR (WHADVP (WRB why ) ) (S (NP (NNP Biden ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) )"
    parse = raw_parse.upper()

    sentence = ptree.Sentence(
        parse,
        "Obama asked himself why Biden was tired",
        PETRreader.dstr_to_ordate("20150813"),
    )
    verb_phrase = sentence.tree.children[1]
    reflexive = verb_phrase.children[1]
    assert reflexive.get_meaning() == ["USAGOV"]
def test_personal1():
    """Check that the personal pronoun phrase ("HE") is coded as USAGOV.

    The phrase under test is reached by walking
    tree.children[1].children[1].children[0].children[0].
    """
    raw_parse = "(S (NP (NNP Obama ) ) (VP (VBD said ) (SBAR (S (NP (PRP he ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) ) "
    parse = raw_parse.upper()

    sentence = ptree.Sentence(
        parse,
        "Obama said he was tired",
        PETRreader.dstr_to_ordate("20150813"),
    )
    node = sentence.tree
    # Descend one child index at a time to the pronoun's noun phrase.
    for child_index in (1, 1, 0, 0):
        node = node.children[child_index]
    assert node.get_meaning() == ["USAGOV"]
def test_date_check():
    """Check that "Carl XVI Gustaf" is coded differently depending on the date.

    Each (date string, expected code list) pair is run against the same
    parse; the phrase examined is tree.children[0].
    """
    parse = "(S (NP (NNP CARL ) (NN XVI ) (NNP GUSTAF ) ) )"

    cases = [
        ("20150813", ["SWEGOV"]),
        ("19720813", ["SWEELI"]),
        ("19010813", ["SWEELI"]),
    ]

    for datestr, wanted in cases:
        test = ptree.Sentence(
            parse,
            "Carl XVI Gustaf",
            PETRreader.dstr_to_ordate(datestr),
        )
        assert test.tree.children[0].get_meaning() == wanted
def test_reflexive2():
    """Check that the embedded reflexive phrase ("HIMSELF") is coded as RUSGOV.

    The phrase under test is reached by taking child index 1 five times
    from the root of the tree.
    """
    raw_parse = "(S (NP (NNP Obama ) ) (VP (VBD knew ) (SBAR (IN that ) (S (NP (NNP Putin ) ) (VP (VBD liked ) (NP (PRP himself ) ) ) ) ) )  ) "
    parse = raw_parse.upper()

    # NOTE(review): the sentence text below says "Biden liked him" while the
    # parse says "PUTIN LIKED HIMSELF"; kept as in the original fixture.
    sentence = ptree.Sentence(
        parse,
        "Obama knew that Biden liked him",
        PETRreader.dstr_to_ordate("20150813"),
    )
    node = sentence.tree
    for _ in range(5):
        node = node.children[1]
    assert node.get_meaning() == ["RUSGOV"]
예제 #7
0
def do_coding(event_dict, out_file):
    """
    Main coding loop: code every parsed sentence of every story.

    Note that entering any character other than 'Enter' at the
    prompt will stop the program: this is deliberate.
    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story

    Parameters
    ----------
    event_dict: dict
        Stories keyed by story id. As read here, each story has
        ['meta']['date'] and a ['sents'] mapping from sentence id to a dict
        with 'content' and, optionally, 'parsed', 'date' and 'config'.
    out_file:
        If truthy, it is passed to open_tex() and the tree of every coded
        sentence is written to the returned handle. The handle is closed
        with close_tex() even if coding raises or the user aborts at the
        pause prompt.

    Returns
    -------
    event_dict: dict
        The same object, updated in place. Each story's ['meta']['verbs']
        holds the meta of its last coded sentence; sentences that produced
        events gain 'events' (and 'issues' when PETRglobals.IssueFileName is
        set and get_issues() returns something); a story discarded by
        check_discards() has its ['sents'] replaced by None.
    """

    NStory = 0
    NSent = 0
    NEvents = 0
    NEmpty = 0
    NDiscardSent = 0
    NDiscardStory = 0

    tex_file = open_tex(out_file) if out_file else None

    logger = logging.getLogger('petr_log')
    times = 0
    sents = 0
    try:
        for key, val in sorted(event_dict.items()):
            NStory += 1

            SkipStory = False
            print('\n\nProcessing {}'.format(key))
            StoryDate = val['meta']['date']
            for sent in val['sents']:
                NSent += 1
                sent_data = val['sents'][sent]
                # Built before the parse check so the "no parse" log line
                # always names the sentence that is actually being skipped.
                SentenceID = '{}_{}'.format(key, sent)
                if 'parsed' not in sent_data:
                    logger.info(
                        '{} has no parse information. Passing.'.format(
                            SentenceID))
                    continue

                if 'config' in sent_data:
                    for config in sent_data['config'].values():
                        change_Config_Options(config)

                SentenceText = sent_data['content']
                # A sentence-level date overrides the story date.
                SentenceDate = sent_data[
                    'date'] if 'date' in sent_data else StoryDate
                Date = PETRreader.dstr_to_ordate(SentenceDate)

                print("\t\t",SentenceID)
                treestr = sent_data['parsed']
                disc = check_discards(SentenceText)
                if disc[0] > 0:
                    if disc[0] == 1:
                        print("Discard sentence:", disc[1])
                        logger.info('\tSentence discard. {}'.format(disc[1]))
                        NDiscardSent += 1
                        continue
                    else:
                        print("Discard story:", disc[1])
                        logger.info('\tStory discard. {}'.format(disc[1]))
                        SkipStory = True
                        NDiscardStory += 1
                        break

                t1 = time.time()
                sentence = PETRtree.Sentence(treestr, SentenceText, Date)
                coded_events, meta = sentence.get_events()
                code_time = time.time() - t1
                val['meta']['verbs'] = meta

                if out_file:
                    sentence.print_to_file(sentence.tree, file=tex_file)

                del sentence
                times += code_time
                sents += 1
                print('\t\t',code_time)

                if coded_events:
                    sent_data['events'] = coded_events
                    if PETRglobals.IssueFileName != "":
                        event_issues = get_issues(SentenceText)
                        if event_issues:
                            sent_data['issues'] = event_issues

                if PETRglobals.PauseBySentence:
                    if len(input("Press Enter to continue...")) > 0:
                        sys.exit()

                NEvents += len(coded_events)
                if len(coded_events) == 0:
                    NEmpty += 1

            if SkipStory:
                val['sents'] = None
    finally:
        if out_file:
            close_tex(tex_file)

    print("\nSummary:")
    print(
        "Stories read:",
        NStory,
        "   Sentences coded:",
        NSent,
        "  Events generated:",
        NEvents)
    print(
        "Discards:  Sentence",
        NDiscardSent,
        "  Story",
        NDiscardStory,
        "  Sentences without events:",
        NEmpty)
    print("Average Coding time = ", times/sents if sents else 0)
    return event_dict
예제 #8
0
def do_coding(event_dict):
    """
    Main coding loop: code every parsed sentence of every paragraph.

    Note that entering any character other than 'Enter' at the
    prompt will stop the program: this is deliberate.
    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story

    Parameters
    ----------
    event_dict: dict
        Paragraphs keyed by a string whose part before the first '-' is
        used as the article id. As read here, each entry has
        ['meta']['date'] and a ['sents'] mapping from sentence id to a dict
        with 'content' and, optionally, 'parsed' and 'date'. Entries whose
        date is the string 'NULL' are skipped.

    Returns
    -------
    event_dict: dict
        The same object, updated in place. Every processed entry gets
        ['meta']['realiseTime'] and ['meta']['reportTime']; sentences that
        produced events gain 'events' and 'meta' (plus phrase/time details
        when any of the PETRglobals.Write* flags is set, and 'issues' when
        PETRglobals.IssueFileName is set and get_issues() returns something).
        A sentence whose tree construction or event extraction raises is
        logged and skipped.
    """

    # These counters feed a detailed summary that is currently disabled;
    # only the "Summary:" header is printed at the end.
    NStory = 0
    NSent = 0
    NEmpty = 0
    NDiscardSent = 0
    NDiscardStory = 0

    logger = logging.getLogger('petr_log')
    times = 0
    sents = 0

    # Release (publication) time per article.
    realiseTimeDic = get_releasetime(event_dict)

    if not realiseTimeDic:
        print(
            "realiseTimeDic have no timeinfo ,please check “get_releasetime” method"
        )
    # Report time per article.
    reporttimeDic = get_reporttime(event_dict, realiseTimeDic)

    for key, val in sorted(event_dict.items()):
        NStory += 1
        SkipStory = False
        print('\n\nProcessing paragraph {}'.format(key))
        StoryDate = val['meta']['date']
        if StoryDate == 'NULL':
            continue

        articleId = key.split("-")[0]

        # Attach release and report time; when the report time is missing,
        # the release time is used instead.
        val["meta"]["realiseTime"] = realiseTimeDic[articleId]
        if articleId in reporttimeDic:
            val["meta"]["reportTime"] = reporttimeDic[articleId]
        else:
            val["meta"]["reportTime"] = realiseTimeDic[articleId]

        for sent in sorted(val['sents']):
            print('\n\nProcessing sentence {}'.format(sent))
            NSent += 1
            sent_data = val['sents'][sent]
            # Built before the parse check so the "no parse" log line always
            # names the sentence that is actually being skipped.
            SentenceID = '{}_{}'.format(key, sent)
            if 'parsed' not in sent_data:
                logger.info(
                    '{} has no parse information. Passing.'.format(SentenceID))
                continue

            SentenceText = sent_data['content']
            # A sentence-level date overrides the paragraph date.
            SentenceDate = sent_data[
                'date'] if 'date' in sent_data else StoryDate
            # Keep only the part before the first space and drop dashes
            # before converting.
            Date = PETRreader.dstr_to_ordate(
                SentenceDate.split(' ')[0].replace('-', ''))
            treestr = sent_data['parsed']
            disc = check_discards(SentenceText)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    NDiscardSent += 1
                    continue
                else:
                    print("Discard story:", disc[1])
                    logger.info('\tStory discard. {}'.format(disc[1]))
                    # NOTE(review): left False, so the 'sents' of a discarded
                    # story are never cleared below -- confirm this is
                    # intended.
                    SkipStory = False
                    NDiscardStory += 1
                    break

            t1 = time.time()
            try:
                sentence = PETRtree.Sentence(treestr, SentenceText, Date)
            except Exception as e:
                message = "ERROR IN PETRARCH2 DO_CODING:" + SentenceID + "\n" + SentenceText + str(
                    e) + "\n"
                logging.exception(message)
                continue
            set_nt_textList(sentence)

            set_sentenceTimeByReport(sentence, val["meta"]["reportTime"],
                                     val['sents'], sent)

            timeText = sentence.ntTextList
            sentenceTime = sentence.sentenceTime
            try:
                coded_events, meta = sentence.get_events()
            except Exception as e:
                message = "ERROR IN PETRARCH2 DO_CODING:" + SentenceID + "\n" + SentenceText + str(
                    e) + "\n"
                logging.exception(message)
                # Skip the sentence: without this, coded_events and meta
                # would be undefined or left over from the previous sentence.
                continue

            code_time = time.time() - t1
            # NOTE(review): the elif branch can never run, because the first
            # condition already covers PETRglobals.NullActors. In practice
            # only the final else branch has been exercised so far.
            if PETRglobals.NullVerbs or PETRglobals.NullActors:
                val['meta'] = meta
                val['text'] = sentence.txt
            elif PETRglobals.NullActors:
                val['events'] = coded_events
                coded_events = None  # skips additional processing
                val['text'] = sentence.txt
            else:
                # 16.04.30 pas: we're using the key value 'meta' at two very
                # different levels of event_dict -- see the per-sentence
                # 'meta' assignment below -- and this is potentially
                # confusing, so it probably would be useful to change one of
                # those
                val['meta']['verbs'] = meta
            del sentence

            times += code_time
            sents += 1

            if coded_events:
                sent_data['events'] = coded_events
                sent_data['meta'] = meta
                if PETRglobals.WriteActorText or PETRglobals.WriteEventText or PETRglobals.WriteActorRoot:
                    text_dict = utilities.extract_phrases(
                        sent_data, SentenceID)
                    print('DC-td1:', text_dict)  # --
                    if text_dict:
                        sent_meta = sent_data['meta']
                        sent_meta['actortext'] = {}
                        sent_meta['eventtext'] = {}
                        sent_meta['actorroot'] = {}
                        sent_meta['eventroot'] = {}
                        sent_meta['Source'] = {}
                        sent_meta['Target'] = {}
                        sent_meta['timeText'] = timeText
                        # NOTE(review): this stores a one-element set, not
                        # the bare value -- confirm that is intended.
                        sent_meta['sentenceTime'] = {sentenceTime}
                        for evt in coded_events:
                            # 16.04.30 pas bypasses problems with expansion
                            # of compounds
                            if evt in text_dict:
                                phrases = text_dict[evt]
                                sent_meta['actortext'][evt] = phrases[:2]
                                sent_meta['eventtext'][evt] = phrases[2]
                                sent_meta['actorroot'][evt] = phrases[3:5]
                                sent_meta['eventroot'][evt] = phrases[5]
                                sent_meta['Source'][evt] = phrases[0]
                                sent_meta['Target'][evt] = phrases[1]

                if PETRglobals.IssueFileName != "":
                    event_issues = get_issues(SentenceText)
                    if event_issues:
                        sent_data['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            if coded_events is not None and len(coded_events) == 0:
                NEmpty += 1

        if SkipStory:
            val['sents'] = None

    print("\nSummary:")
    return event_dict
예제 #9
0
def do_coding(event_dict, out_file):
    """
    Main coding loop: code every parsed sentence of every story.

    Note that entering any character other than 'Enter' at the
    prompt will stop the program: this is deliberate.
    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story

    Per-sentence 'config' overrides are not applied in this version.

    Parameters
    ----------
    event_dict: dict
        Stories keyed by story id. As read here, each story has
        ['meta']['date'] and a ['sents'] mapping from sentence id to a dict
        with 'content' and, optionally, 'parsed' and 'date'.
    out_file:
        If truthy, it is passed to open_tex() and the tree of every coded
        sentence is written to the returned handle. The handle is closed
        with close_tex() even if coding raises or the user aborts at the
        pause prompt.

    Returns
    -------
    event_dict: dict
        The same object, updated in place. Each story's ['meta']['verbs']
        holds the meta of its last coded sentence; sentences that produced
        events gain 'events' and 'meta' (plus 'actortext', 'eventtext' and
        'actorroot' inside that meta when any PETRglobals.Write* flag is set,
        and 'issues' when PETRglobals.IssueFileName is set and get_issues()
        returns something); a story discarded by check_discards() has its
        ['sents'] replaced by None.
    """

    NStory = 0
    NSent = 0
    NEvents = 0
    NEmpty = 0
    NDiscardSent = 0
    NDiscardStory = 0

    tex_file = open_tex(out_file) if out_file else None

    logger = logging.getLogger('petr_log')
    times = 0
    sents = 0
    try:
        for key, val in sorted(event_dict.items()):
            NStory += 1

            SkipStory = False
            print('\n\nProcessing story {}'.format(key))
            StoryDate = val['meta']['date']
            for sent in val['sents']:
                NSent += 1
                sent_data = val['sents'][sent]
                # Built before the parse check so the "no parse" log line
                # always names the sentence that is actually being skipped.
                SentenceID = '{}_{}'.format(key, sent)
                if 'parsed' not in sent_data:
                    logger.info(
                        '{} has no parse information. Passing.'.format(
                            SentenceID))
                    continue

                SentenceText = sent_data['content']
                # A sentence-level date overrides the story date.
                SentenceDate = sent_data[
                    'date'] if 'date' in sent_data else StoryDate
                Date = PETRreader.dstr_to_ordate(SentenceDate)

                print("\n", SentenceID)
                treestr = sent_data['parsed']
                disc = check_discards(SentenceText)
                if disc[0] > 0:
                    if disc[0] == 1:
                        print("Discard sentence:", disc[1])
                        logger.info('\tSentence discard. {}'.format(disc[1]))
                        NDiscardSent += 1
                        continue
                    else:
                        print("Discard story:", disc[1])
                        logger.info('\tStory discard. {}'.format(disc[1]))
                        SkipStory = True
                        NDiscardStory += 1
                        break

                t1 = time.time()
                sentence = PETRtree.Sentence(treestr, SentenceText, Date)
                print(sentence.actor)
                print(sentence.agent)
                # this is the entry point into the processing in PETRtree
                coded_events, meta = sentence.get_events()
                code_time = time.time() - t1
                # 16.04.30 pas: we're using the key value 'meta' at two very
                # different levels of event_dict -- see the per-sentence
                # 'meta' assignment below -- and this is potentially
                # confusing, so it probably would be useful to change one of
                # those
                val['meta']['verbs'] = meta

                if out_file:
                    sentence.print_to_file(sentence.tree, file=tex_file)

                del sentence
                times += code_time
                sents += 1

                if coded_events:
                    sent_data['events'] = coded_events
                    sent_data['meta'] = meta
                    if PETRglobals.WriteActorText or PETRglobals.WriteEventText or PETRglobals.WriteActorRoot:
                        text_dict = utilities.extract_phrases(
                            sent_data, SentenceID)
                        if text_dict:
                            sent_meta = sent_data['meta']
                            sent_meta['actortext'] = {}
                            sent_meta['eventtext'] = {}
                            sent_meta['actorroot'] = {}
                            for evt in coded_events:
                                # 16.04.30 pas bypasses problems with
                                # expansion of compounds
                                if evt in text_dict:
                                    phrases = text_dict[evt]
                                    sent_meta['actortext'][evt] = phrases[:2]
                                    sent_meta['eventtext'][evt] = phrases[2]
                                    sent_meta['actorroot'][evt] = phrases[3:5]

                    if PETRglobals.IssueFileName != "":
                        event_issues = get_issues(SentenceText)
                        if event_issues:
                            sent_data['issues'] = event_issues

                if PETRglobals.PauseBySentence:
                    if len(input("Press Enter to continue...")) > 0:
                        sys.exit()

                NEvents += len(coded_events)
                if len(coded_events) == 0:
                    NEmpty += 1

            if SkipStory:
                val['sents'] = None
    finally:
        if out_file:
            close_tex(tex_file)

    print("\nSummary:")
    print("Stories read:", NStory, "   Sentences coded:", NSent,
          "  Events generated:", NEvents)
    print("Discards:  Sentence", NDiscardSent, "  Story", NDiscardStory,
          "  Sentences without events:", NEmpty)
    print("Average Coding time = ", times / sents if sents else 0)
    return event_dict
예제 #10
0
def do_coding(event_dict):
    """
    Main coding loop: code every parsed sentence of every story.

    Note that entering any character other than 'Enter' at the
    prompt will stop the program: this is deliberate.
    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story

    Parameters
    ----------
    event_dict: dict
        Stories keyed by story id. As read here, each story has
        ['meta']['date'] and a ['sents'] mapping from sentence id to a dict
        with 'content' and, optionally, 'parsed', 'date' and 'config'.

    Returns
    -------
    event_dict: dict
        The same object, updated in place. Every coded sentence gains
        'events', 'verbs', 'nouns' and 'triplets' copied from the
        PETRgraph.Sentence object (and 'issues' when events were returned,
        PETRglobals.IssueFileName is set and get_issues() returns
        something); a story discarded by check_discards() has its ['sents']
        replaced by None.
    """

    NStory = 0
    NSent = 0
    NEvents = 0
    NEmpty = 0
    NDiscardSent = 0
    NDiscardStory = 0

    logger = logging.getLogger('petr_log')
    times = 0
    sents = 0
    for key, val in sorted(event_dict.items()):
        NStory += 1

        SkipStory = False
        print('\n\nProcessing story {}'.format(key))
        StoryDate = val['meta']['date']
        for sent in val['sents']:
            NSent += 1
            sent_data = val['sents'][sent]
            # Built before the parse check so the "no parse" log line always
            # names the sentence that is actually being skipped.
            SentenceID = '{}_{}'.format(key, sent)
            if 'parsed' not in sent_data:
                logger.info(
                    '{} has no parse information. Passing.'.format(SentenceID))
                continue

            if 'config' in sent_data:
                for config in sent_data['config'].values():
                    change_Config_Options(config)

            SentenceText = sent_data['content']
            # A sentence-level date overrides the story date.
            SentenceDate = sent_data[
                'date'] if 'date' in sent_data else StoryDate
            Date = PETRreader.dstr_to_ordate(SentenceDate)

            print("\n", SentenceID)
            treestr = sent_data['parsed']

            disc = check_discards(SentenceText)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    NDiscardSent += 1
                    continue
                else:
                    print("Discard story:", disc[1])
                    logger.info('\tStory discard. {}'.format(disc[1]))
                    SkipStory = True
                    NDiscardStory += 1
                    break

            t1 = time.time()
            sentence = PETRgraph.Sentence(treestr, SentenceText, Date)
            print(sentence.txt)
            # this is the entry point into the processing in PETRgraph
            coded_events = sentence.get_events()

            sent_data['events'] = sentence.events
            sent_data['verbs'] = sentence.verbs
            sent_data['nouns'] = sentence.nouns
            sent_data['triplets'] = sentence.triplets

            logger.debug("check events:")
            for eventID, event in sent_data['events'].items():
                logger.debug("event:" + eventID)
                logger.debug(event)

            code_time = time.time() - t1

            del sentence
            times += code_time
            sents += 1

            if coded_events and PETRglobals.IssueFileName != "":
                event_issues = get_issues(SentenceText)
                if event_issues:
                    sent_data['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            NEvents += len(coded_events.values())
            if len(coded_events) == 0:
                NEmpty += 1

        if SkipStory:
            val['sents'] = None

    print("\nSummary:")
    print("Stories read:", NStory, "   Sentences coded:", NSent,
          "  Events generated:", NEvents)
    print("Discards:  Sentence", NDiscardSent, "  Story", NDiscardStory,
          "  Sentences without events:", NEmpty)
    print("Average Coding time = ", times / sents if sents else 0)
    return event_dict
예제 #11
0
    def check_date(self, match):
        """
        Pick the actor code whose date restriction covers self.date.

        Parameters
        -----------
        match: list
               Candidate entries; entry[0] is the code and entry[1] is its
               list of date strings. An empty list means the code always
               applies. A single string starting with '<' means "strictly
               before that date"; any other single string has its first
               character dropped and is used as an inclusive lower bound.
               Two strings form a range: first <= self.date < second.

        Returns
        -------
        code: string
              The first truthy code whose restriction holds. None when match
              is empty; otherwise a falsy value (normally "") when no entry
              applies.

        Note <16.06.10 pas>
        -------------------
        (Kept from the original author. The trap it mentions is not present
        in this version of the method: exceptions propagate to the caller.)

        In a very small set of cases involving a reflexive PRP inside a PP,
        the system can get into an infinite recursion where it first backs up
        a couple levels from the (PP, then this call to child.get_meaning()
        drops back down to the same point via the two child invocations in
        NounPhrase.get_meaning()

                    elif child.label == "PP":
                        m = self.resolve_codes(child.get_meaning())

        and in PrepPhrase.get_meaning()

                    self.meaning = self.children[1].get_meaning() if isinstance(self.children[1],NounPhrase) else ""

        which takes one back to the same point at one deeper level of
        recursion. These structures occurred about five times in a 20M
        sentence corpus, and I couldn't find any fix that didn't break
        something else, so I just trapped it here.

        The full record for one of the offending cases is:

        <Sentence date = "20150824" id ="e35ef55a-fa30-4c34-baae-965dea33d8d8_3" source = "ANOTHER INFINITE RECURSION" sentence = "True">
        <Text>
        He started out at the bottom of the Hollywood rung, directed his own movie and managed to get noticed by Steven
        Spielberg himself to nab a tiny role in 1998s Saving Private Ryan .
        </Text>
        <Parse>
        (ROOT (S (S (NP (PRP He))
        (VP (VBD started) (PRT (RP out))
        (PP (IN at)
        (NP (NP (DT the) (NN bottom))
        (PP (IN of) (NP (DT the) (NNP Hollywood) ))))))
        (VP (VBD rung))
        (, ,)
        (S (VP
        (VP (VBD directed) (NP (PRP$ his) (JJ own) (NN movie))) (CC and)
        (VP (VBD managed) (S
        (VP (TO to)
        (VP (VB get)
            (VP (VBN noticed)
            (PP (IN by)
                (NP (NNP Steven) (NNP Spielberg) (PRP himself))
            )
            (S  (VP (TO to)  (VP (VB nab)
                    (NP (NP (DT a) (JJ tiny) (NN role))
                    (PP (IN in)
                        (NP (NP (NNS 1998s))  (VP (VBG Saving)  (NP (JJ Private) (NNP Ryan))
                    ))))))))))))))
        (. .)))
        </Parse>
        </Sentence>

        """

        code = None
        for entry in match:
            code = ""
            # Convert every date string to an ordinal, keeping a leading
            # '<' or '>' marker in front of the converted number.
            bounds = [
                d[0] + str(PETRreader.dstr_to_ordate(d[1:]))
                if d[0] in '<>'
                else str(PETRreader.dstr_to_ordate(d))
                for d in entry[1]
            ]

            today = self.date

            if not bounds:
                in_force = True
            elif len(bounds) == 1:
                only = bounds[0]
                if only[0] == '<':
                    in_force = today < int(only[1:])
                else:
                    in_force = today >= int(only[1:])
            else:
                # Upper bound is tested first, as in the original.
                in_force = today < int(bounds[1]) and today >= int(bounds[0])

            if in_force:
                code = entry[0]
            if code:
                return code

        return code
예제 #12
0
def do_coding(event_dict, out_file):
    """
    Main coding loop: code every parsed sentence of every story.

    Note that entering any character other than 'Enter' at the
    PauseBySentence prompt will stop the program: this is deliberate.

    Historical note carried over from the original docstring:
    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story

    Parameters
    ----------
    event_dict: dict
                Stories keyed by story ID. Each story is read for
                ['meta']['date'] and ['sents']; each sentence is read for
                'content', and optionally 'parsed', 'config' and 'date'.
    out_file: object
              Currently unused; the TeX output it controlled is disabled
              (<16.06.18 pas>). Kept so existing callers keep working.

    Returns
    -------
    event_dict: dict
                The same dictionary, updated in place with the coding results
                ('events', 'meta', optional 'issues' per sentence; 'sents' is
                set to None for a discarded story).
    """
    NStory = 0
    NSent = 0
    NEvents = 0
    NEmpty = 0
    NDiscardSent = 0
    NDiscardStory = 0

    logger = logging.getLogger('petr_log')
    times = 0   # accumulated coding time over all coded sentences
    sents = 0   # number of sentences that were actually coded

    for key, val in sorted(event_dict.items()):
        NStory += 1
        SkipStory = False
        print('\n\nProcessing story {}'.format(key))
        StoryDate = val['meta']['date']

        for sent in val['sents']:
            NSent += 1
            sent_info = val['sents'][sent]
            # Built before the parse check so the "no parse" log message below
            # always names the current sentence (it used to be assigned only
            # for parsed sentences, which raised UnboundLocalError or logged
            # the previous sentence's ID).
            SentenceID = '{}_{}'.format(key, sent)

            if 'parsed' not in sent_info:
                logger.info(
                    '{} has no parse information. Passing.'.format(SentenceID))
                continue

            if 'config' in sent_info:
                for config in sent_info['config'].values():
                    change_Config_Options(config)

            SentenceText = sent_info['content']
            # A sentence-level date overrides the story date.
            SentenceDate = sent_info['date'] if 'date' in sent_info else StoryDate
            Date = PETRreader.dstr_to_ordate(SentenceDate)

            print("\n", SentenceID)
            treestr = sent_info['parsed']

            # disc[0]: 0 = keep, 1 = discard this sentence, anything else =
            # discard the whole story; disc[1] is the text that is reported.
            disc = check_discards(SentenceText)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    NDiscardSent += 1
                    continue
                print("Discard story:", disc[1])
                logger.info('\tStory discard. {}'.format(disc[1]))
                SkipStory = True
                NDiscardStory += 1
                break

            t1 = time.time()
            sentence = PETRtree.Sentence(treestr, SentenceText, Date)
            print(sentence.txt)
            # this is the entry point into the processing in PETRtree
            coded_events, meta = sentence.get_events()
            code_time = time.time() - t1

            if PETRglobals.NullVerbs or PETRglobals.NullActors:
                # NOTE(review): the original had a further
                # `elif PETRglobals.NullActors:` branch (store the events on
                # the story and set coded_events to None) that could never run
                # because this condition already covers NullActors. It is
                # omitted here, which does not change behaviour; confirm
                # whether this test was meant to be NullVerbs only.
                val['meta'] = meta
                val['text'] = sentence.txt
            else:
                # 16.04.30 pas: the key 'meta' is used at two very different
                # levels of event_dict (story level here, sentence level
                # below), which is potentially confusing.
                val['meta']['verbs'] = meta

            del sentence
            times += code_time
            sents += 1

            if coded_events:
                sent_info['events'] = coded_events
                sent_info['meta'] = meta
                if (PETRglobals.WriteActorText or PETRglobals.WriteEventText
                        or PETRglobals.WriteActorRoot):
                    text_dict = utilities.extract_phrases(sent_info, SentenceID)
                    if text_dict:
                        sent_info['meta']['actortext'] = {}
                        sent_info['meta']['eventtext'] = {}
                        sent_info['meta']['actorroot'] = {}
                        for evt in coded_events:
                            # 16.04.30 pas bypasses problems with expansion
                            # of compounds
                            if evt in text_dict:
                                sent_info['meta']['actortext'][evt] = text_dict[evt][:2]
                                sent_info['meta']['eventtext'][evt] = text_dict[evt][2]
                                sent_info['meta']['actorroot'][evt] = text_dict[evt][3:5]

            if coded_events and PETRglobals.IssueFileName != "":
                event_issues = get_issues(SentenceText)
                if event_issues:
                    sent_info['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            NEvents += len(coded_events)
            if len(coded_events) == 0:
                NEmpty += 1

        if SkipStory:
            event_dict[key]['sents'] = None

    print("\nSummary:")
    print(
        "Stories read:",
        NStory,
        "   Sentences coded:",
        NSent,
        "  Events generated:",
        NEvents)
    print(
        "Discards:  Sentence",
        NDiscardSent,
        "  Story",
        NDiscardStory,
        "  Sentences without events:",
        NEmpty)
    print("Average Coding time = ", times / sents if sents else 0)
    return event_dict
예제 #13
0
def do_coding(event_dict):
    """
    Reduced coding loop that only parses each sentence and prints its events.

    For every story (in sorted key order) and every sentence that has a
    'parsed' entry, this applies any per-sentence 'config' options, runs the
    discard check, builds a PETRtree.Sentence and prints the coded events
    returned by get_events().

    NOTE: in this variant the storage of events and metadata in event_dict,
    the issue coding, the pause-by-sentence prompt and the final summary were
    all disabled (commented out), so nothing is written back to event_dict
    here and no statistics are collected.

    Parameters
    ----------
    event_dict: dict
                Stories keyed by story ID. Each story is read for
                ['meta']['date'] and ['sents']; each sentence is read for
                'content', and optionally 'parsed', 'config' and 'date'.

    Returns
    -------
    event_dict: dict
                The dictionary that was passed in.
    """
    logger = logging.getLogger('petr_log')

    for key, val in sorted(event_dict.items()):
        print('\n\nProcessing story {}'.format(key))
        StoryDate = val['meta']['date']

        for sent in val['sents']:
            print("sent:", sent)
            sent_info = val['sents'][sent]
            if 'parsed' not in sent_info:
                # Sentences without parse information are silently skipped.
                continue

            if 'config' in sent_info:
                for config in sent_info['config'].values():
                    change_Config_Options(config)

            SentenceID = '{}_{}'.format(key, sent)
            SentenceText = sent_info['content']
            # A sentence-level date overrides the story date.
            SentenceDate = sent_info['date'] if 'date' in sent_info else StoryDate
            Date = PETRreader.dstr_to_ordate(SentenceDate)

            print("\n", SentenceID)
            treestr = sent_info['parsed']

            # disc[0]: 0 = keep, 1 = discard this sentence, anything else =
            # stop processing the remaining sentences of this story.
            disc = check_discards(SentenceText)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    continue
                print("Discard story:", disc[1])
                logger.info('\tStory discard. {}'.format(disc[1]))
                break

            sentence = PETRtree.Sentence(treestr, SentenceText, Date)
            print(sentence.txt)
            # this is the entry point into the processing in PETRtree
            coded_events, _meta = sentence.get_events()
            print("coded_events:", coded_events)

    return event_dict
예제 #14
0
def test_reflexive2():
    """Check that the node five levels down the second-child path codes as RUSGOV.

    NOTE(review): the sentence text below mentions "Biden ... him" while the
    parse contains PUTIN / HIMSELF; presumably only the parse matters for the
    assertion -- confirm and align the text if so.
    """
    tree_text = "(S (NP (NNP Obama ) ) (VP (VBD knew ) (SBAR (IN that ) (S (NP (NNP Putin ) ) (VP (VBD liked ) (NP (PRP himself ) ) ) ) ) )  ) ".upper()
    coding_date = PETRreader.dstr_to_ordate("20150813")
    sentence = ptree.Sentence(tree_text, "Obama knew that Biden liked him", coding_date)

    # Walk children[1] five times, starting from the root of the tree.
    node = sentence.tree
    for _ in range(5):
        node = node.children[1]

    assert node.get_meaning() == ["RUSGOV"]
예제 #15
0
def test_reflexive():
    """Check that the node at tree.children[1].children[1] codes as USAGOV."""
    tree_text = "(S (NP (NNP Obama ) )  (VP (VBD asked ) (NP (PRP himself ) )  (SBAR (WHADVP (WRB why ) ) (S (NP (NNP Biden ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) )".upper()
    coding_date = PETRreader.dstr_to_ordate("20150813")
    sentence = ptree.Sentence(tree_text, "Obama asked himself why Biden was tired", coding_date)

    # Second child of the root, then that node's second child.
    verb_phrase = sentence.tree.children[1]
    node = verb_phrase.children[1]

    assert node.get_meaning() == ["USAGOV"]
예제 #16
0
def test_personal1():
    """Check that the node at path [1][1][0][0] from the tree root codes as USAGOV."""
    tree_text = "(S (NP (NNP Obama ) ) (VP (VBD said ) (SBAR (S (NP (PRP he ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) ) ".upper()
    coding_date = PETRreader.dstr_to_ordate("20150813")
    sentence = ptree.Sentence(tree_text, "Obama said he was tired", coding_date)

    # Follow the child indexes one step at a time from the root.
    node = sentence.tree
    for child_index in (1, 1, 0, 0):
        node = node.children[child_index]

    assert node.get_meaning() == ["USAGOV"]