コード例 #1
0
ファイル: petrarch2.py プロジェクト: langstok/petrarch2
def read_dictionaries(validation=False):
    """
    Load every dictionary file named in PETRglobals through PETRreader.

    Each file is resolved under 'data/dictionaries' with utilities._get_data
    and read in this order: verbs, every actor file, agents, discards and --
    only when PETRglobals.IssueFileName is not the empty string -- issues.
    The configured name of each dictionary is printed before it is read.

    validation -- accepted for callers but not used inside this function.
    """
    dict_dir = 'data/dictionaries'

    print('Verb dictionary:', PETRglobals.VerbFileName)
    PETRreader.read_verb_dictionary(
        utilities._get_data(dict_dir, PETRglobals.VerbFileName))

    print('Actor dictionaries:', PETRglobals.ActorFileList)
    for actor_file in PETRglobals.ActorFileList:
        PETRreader.read_actor_dictionary(
            utilities._get_data(dict_dir, actor_file))

    print('Agent dictionary:', PETRglobals.AgentFileName)
    PETRreader.read_agent_dictionary(
        utilities._get_data(dict_dir, PETRglobals.AgentFileName))

    print('Discard dictionary:', PETRglobals.DiscardFileName)
    PETRreader.read_discard_list(
        utilities._get_data(dict_dir, PETRglobals.DiscardFileName))

    # The issues dictionary is optional: an empty file name means "skip it".
    if PETRglobals.IssueFileName == "":
        return
    print('Issues dictionary:', PETRglobals.IssueFileName)
    PETRreader.read_issue_list(
        utilities._get_data(dict_dir, PETRglobals.IssueFileName))
コード例 #2
0
 def __init__(self, petrGlobal=None, config_folder='data/config/', config_file='PETR_config.ini'):
     """
     Initialise either from a supplied map or from the config file on disk.

     petrGlobal    -- optional mapping; when it is non-empty it is handed to
                      self.load() and nothing is read from disk.  The default
                      is None rather than a shared mutable ``{}``; an empty
                      or missing map takes the config-file path as before.
     config_folder -- folder passed to utilities._get_data to find the config.
     config_file   -- config file name, also written to the log.
     """
     if petrGlobal:
         print("LOADING FROM MAP")
         self.load(petrGlobal)
         return

     # No map supplied: set up logging, parse the config, load dictionaries.
     utilities.init_logger('PETRARCH.log')
     logger = logging.getLogger('petr_log')
     PETRglobals.RunTimeString = time.asctime()
     logger.info('Using Config file: %s', config_file)
     PETRreader.parse_Config(utilities._get_data(config_folder, config_file))
     petrarch2.read_dictionaries()
     print("SUCCESSFULL ON LOADING DICTIONARIES")
コード例 #3
0
def test_date_check():
    """
    Check that the coding of the same noun phrase depends on the sentence date.

    The parse and text are identical for every case; only the date string
    passed through PETRreader.dstr_to_ordate changes, and the first child of
    the tree must return the paired code list from get_meaning().
    """
    tree_text = "(S (NP (NNP CARL ) (NN XVI ) (NNP GUSTAF ) ) )"
    cases = (
        ("20150813", ["SWEGOV"]),
        ("19720813", ["SWEELI"]),
        ("19010813", ["SWEELI"]),
    )

    for date_string, expected_codes in cases:
        sentence = ptree.Sentence(tree_text, "Carl XVI Gustaf",
                                  PETRreader.dstr_to_ordate(date_string))
        noun_phrase = sentence.tree.children[0]
        assert noun_phrase.get_meaning() == expected_codes
コード例 #4
0
def test_date_check():
    """
    Verify date-dependent coding of one fixed noun phrase.

    Three sentences are built from the same parse and text with different
    dates; get_meaning() on the first child of each tree must equal the
    expected code list for that date.
    """
    bracketed = "(S (NP (NNP CARL ) (NN XVI ) (NNP GUSTAF ) ) )"
    surface = "Carl XVI Gustaf"

    def meaning_on(date_string):
        # Build the sentence for this date and code its first child.
        built = ptree.Sentence(bracketed, surface,
                               PETRreader.dstr_to_ordate(date_string))
        return built.tree.children[0].get_meaning()

    assert meaning_on("20150813") == ["SWEGOV"]
    assert meaning_on("19720813") == ["SWEELI"]
    assert meaning_on("19010813") == ["SWEELI"]
コード例 #5
0
def test_personal1():
    """
    Check the coding of the node at child path 1 -> 1 -> 0 -> 0.

    Assuming children follow bracket order, that node is the NP over "HE" in
    the embedded clause; get_meaning() on it must return ["USAGOV"].
    """
    bracketed = "(S (NP (NNP Obama ) ) (VP (VBD said ) (SBAR (S (NP (PRP he ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) ) "
    bracketed = bracketed.upper()

    print('This is a test')
    sentence = ptree.Sentence(bracketed, "Obama said he was tired",
                              PETRreader.dstr_to_ordate("20150813"))

    # Walk down the tree one child index at a time.
    node = sentence.tree
    for child_index in (1, 1, 0, 0):
        node = node.children[child_index]
    assert node.get_meaning() == ["USAGOV"]
コード例 #6
0
def test_reflexive():
    """
    Check the coding of the node at child path 1 -> 1.

    Assuming children follow bracket order, that node is the NP over
    "HIMSELF" directly under the main verb phrase; get_meaning() on it must
    return ["USAGOV"].
    """
    bracketed = "(S (NP (NNP Obama ) )  (VP (VBD asked ) (NP (PRP himself ) )  (SBAR (WHADVP (WRB why ) ) (S (NP (NNP Biden ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) )"
    bracketed = bracketed.upper()

    when = PETRreader.dstr_to_ordate("20150813")
    sentence = ptree.Sentence(bracketed,
                              "Obama asked himself why Biden was tired", when)

    main_vp = sentence.tree.children[1]
    reflexive_np = main_vp.children[1]
    assert reflexive_np.get_meaning() == ["USAGOV"]
コード例 #7
0
def test_reflexive2():
    """
    Check the coding of the node at child path 1 -> 1 -> 1 -> 1 -> 1.

    Assuming children follow bracket order, that node is the NP over
    "HIMSELF" inside the embedded clause; get_meaning() on it must return
    ["RUSGOV"].
    """
    bracketed = "(S (NP (NNP Obama ) ) (VP (VBD knew ) (SBAR (IN that ) (S (NP (NNP Putin ) ) (VP (VBD liked ) (NP (PRP himself ) ) ) ) ) )  ) "
    bracketed = bracketed.upper()

    # NOTE(review): the sentence text names Biden/"him" while the parse has
    # Putin/"himself"; kept exactly as in the original test -- confirm intent.
    sentence = ptree.Sentence(bracketed, "Obama knew that Biden liked him",
                              PETRreader.dstr_to_ordate("20150813"))

    node = sentence.tree
    for child_index in (1, 1, 1, 1, 1):
        node = node.children[child_index]
    assert node.get_meaning() == ["RUSGOV"]
コード例 #8
0
def test_personal1():
    """
    Verify get_meaning() for the node reached through children 1, 1, 0, 0.

    If children are stored in bracket order this is the NP over "HE" in the
    embedded clause, and the expected code list is ["USAGOV"].
    """
    raw_parse = "(S (NP (NNP Obama ) ) (VP (VBD said ) (SBAR (S (NP (PRP he ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) ) "
    parse_upper = raw_parse.upper()

    print('This is a test')
    when = PETRreader.dstr_to_ordate("20150813")
    built = ptree.Sentence(parse_upper, "Obama said he was tired", when)

    verb_phrase = built.tree.children[1]
    embedded_clause = verb_phrase.children[1].children[0]
    target = embedded_clause.children[0]
    assert target.get_meaning() == ["USAGOV"]
コード例 #9
0
ファイル: petrarch2.py プロジェクト: langstok/petrarch2
def run(filepaths, out_file, s_parsed):
    """
    Read XML input, code it, and write the result (called from main()).

    filepaths -- passed to PETRreader.read_xml_input.
    out_file  -- output name; a mode-specific prefix is prepended to it.
    s_parsed  -- when false, the input is sent through
                 utilities.stanford_parse before coding.

    The writer and prefix depend on the global mode: 'nullverbs.' when
    PETRglobals.NullVerbs is set, otherwise 'nullactors.' when
    PETRglobals.NullActors is set, otherwise 'evts.'.
    """
    stories = PETRreader.read_xml_input(filepaths, s_parsed)
    if not s_parsed:
        stories = utilities.stanford_parse(stories)
    coded_stories = do_coding(stories)

    if PETRglobals.NullVerbs:
        writer, prefix = PETRwriter.write_nullverbs, 'nullverbs.'
    elif PETRglobals.NullActors:
        writer, prefix = PETRwriter.write_nullactors, 'nullactors.'
    else:
        writer, prefix = PETRwriter.write_events, 'evts.'
    writer(coded_stories, prefix + out_file)
コード例 #10
0
ファイル: petrarch2.py プロジェクト: langstok/petrarch2
def run_pipeline(data,
                 out_file=None,
                 config=None,
                 write_output=True,
                 parsed=False):
    """
    Code pipeline input end to end; intended to be called externally.

    data         -- passed to PETRreader.read_pipeline_input.
    out_file     -- output file for PETRwriter.write_events; required when
                    write_output is true.
    config       -- path of a user config file; when falsy the packaged
                    'data/config/PETR_config.ini' is used.
    write_output -- when false, nothing is written and the value of
                    PETRwriter.pipe_output is returned instead.
    parsed       -- when false, events go through utilities.stanford_parse
                    before coding.

    Returns the pipe_output value when write_output is false, otherwise None.
    Calls sys.exit() if write_output is true but no out_file was given; note
    that this check happens only after the events have been coded.
    """
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')
    if config:
        print('Using user-specified config: {}'.format(config))
        logger.info('Using user-specified config: {}'.format(config))
        PETRreader.parse_Config(config)
    else:
        logger.info('Using default config file.')
        # Resolve the default path once and reuse it for the log and the parse.
        default_config = utilities._get_data('data/config/', 'PETR_config.ini')
        logger.info('Config path: {}'.format(default_config))
        PETRreader.parse_Config(default_config)

    read_dictionaries()

    logger.info('Hitting read events...')
    events = PETRreader.read_pipeline_input(data)
    if parsed:
        logger.info('Hitting do_coding')
    else:
        events = utilities.stanford_parse(events)
    updated_events = do_coding(events)

    if not write_output:
        return PETRwriter.pipe_output(updated_events)

    if not out_file:
        print('Please specify an output file...')
        # The backslash is doubled so the literal has no invalid escape
        # sequence; the logged text is unchanged.
        logger.warning('Need an output file. ¯\\_(ツ)_/¯')
        sys.exit()

    PETRwriter.write_events(updated_events, out_file)
コード例 #11
0
ファイル: petrarch2.py プロジェクト: langstok/petrarch2
def main():
    """
    Command-line entry point: configure, load dictionaries, code the inputs.

    Steps, in order:
      1. parse the CLI arguments and initialise logging;
      2. parse the user config (cli_args.config) or the packaged default;
      3. switch on null-verbs or null-actors mode if requested;
      4. read the dictionaries;
      5. pick the input paths: every '*.xml' in a directory, a single file,
         or PETRglobals.TextFileList when no input was given;
      6. call run() and print the elapsed coding time.

    Calls sys.exit() when cli_args.inputs is neither a directory nor a file.
    """
    cli_args = parse_cli_args()
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    print(cli_args)
    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info('Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        # The log line used to say "null verbs" here (copy-paste slip).
        logger.info('Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    start_time = time.time()
    print('\n\n')

    paths = PETRglobals.TextFileList
    if cli_args.inputs:
        if os.path.isdir(cli_args.inputs):
            # os.path.join copes with a directory given with or without a
            # trailing separator.
            paths = glob.glob(os.path.join(cli_args.inputs, '*.xml'))
        elif os.path.isfile(cli_args.inputs):
            paths = [cli_args.inputs]
        else:
            print(
                '\nFatal runtime error:\n"' + cli_args.inputs +
                '" could not be located\nPlease enter a valid directory or file of source texts.'
            )
            sys.exit()

    # An empty name is used when no output was requested on the command line.
    out = cli_args.outputs or ""

    if cli_args.command_name == 'parse':
        run(paths, out, cli_args.parsed)
    else:
        # Any other command treats the input as already parsed.
        run(paths, out, True)

    print("Coding time:", time.time() - start_time)

    print("Finished")
コード例 #12
0
ファイル: petrarch2.py プロジェクト: langstok/petrarch2
def do_coding(event_dict):
    """
    Main coding loop: code every parsed sentence of every story, in place.

    event_dict maps a story key to a dict that has at least
    ['meta']['date'] and ['sents']; each entry of 'sents' may carry
    'parsed', 'content', 'date' and 'config'.  Stories are visited in sorted
    key order.  For each sentence that has a 'parsed' entry:

      * its 'config' options, if any, are applied with change_Config_Options;
      * check_discards may skip the sentence (code 1) or abandon the whole
        story (any other positive code), in which case the story's 'sents'
        is set to None after the loop;
      * otherwise a PETRtree.Sentence is built and get_events() is called;
        events, meta and optional phrase texts and issues are stored back
        on the sentence entry.

    Sentences without 'parsed' are only logged.  A summary is printed at the
    end and the same (mutated) event_dict is returned.

    Note that with PETRglobals.PauseBySentence set, entering any character
    other than 'Enter' at the prompt will stop the program: this is
    deliberate.

    Legacy note <14.02.28>: PETRglobals.PauseByStory was reported to pause
    after the first sentence of the *next* story (it is not referenced in
    this function).
    """
    NStory = 0
    NSent = 0
    NEvents = 0
    NEmpty = 0
    NDiscardSent = 0
    NDiscardStory = 0

    logger = logging.getLogger('petr_log')
    times = 0
    sents = 0
    for key, val in sorted(event_dict.items()):
        NStory += 1

        SkipStory = False
        print('\n\nProcessing story {}'.format(key))
        StoryDate = event_dict[key]['meta']['date']
        for sent in val['sents']:
            NSent += 1
            # Built before the 'parsed' check so the no-parse log line below
            # always names the current sentence (it used to be unbound, or
            # left over from the previous sentence, on that path).
            SentenceID = '{}_{}'.format(key, sent)
            sent_data = event_dict[key]['sents'][sent]

            if 'parsed' not in sent_data:
                logger.info(
                    '{} has no parse information. Passing.'.format(SentenceID))
                continue

            if 'config' in sent_data:
                for _, config in sent_data['config'].items():
                    change_Config_Options(config)

            SentenceText = sent_data['content']
            # A sentence-level date overrides the story date.
            SentenceDate = sent_data['date'] if 'date' in sent_data else StoryDate
            Date = PETRreader.dstr_to_ordate(SentenceDate)

            print("\n", SentenceID)
            parsed = sent_data['parsed']
            disc = check_discards(SentenceText)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    NDiscardSent += 1
                    continue
                else:
                    print("Discard story:", disc[1])
                    logger.info('\tStory discard. {}'.format(disc[1]))
                    SkipStory = True
                    NDiscardStory += 1
                    break

            t1 = time.time()
            sentence = PETRtree.Sentence(parsed, SentenceText, Date)
            print(sentence.txt)
            # this is the entry point into the processing in PETRtree
            coded_events, meta = sentence.get_events()
            code_time = time.time() - t1
            if PETRglobals.NullVerbs or PETRglobals.NullActors:
                # Both null modes replace the story-level meta and keep the
                # sentence text.  A separate NullActors-only branch used to
                # follow this test but could never be reached, so it is gone.
                event_dict[key]['meta'] = meta
                event_dict[key]['text'] = sentence.txt
            else:
                # 16.04.30 pas: we're using the key value 'meta' at two very
                # different levels of event_dict (story level here, sentence
                # level further down) and this is potentially confusing, so
                # it probably would be useful to change one of those
                event_dict[key]['meta']['verbs'] = meta

            del sentence
            times += code_time
            sents += 1

            if coded_events:
                sent_data['events'] = coded_events
                sent_data['meta'] = meta
                if (PETRglobals.WriteActorText or PETRglobals.WriteEventText
                        or PETRglobals.WriteActorRoot):
                    text_dict = utilities.extract_phrases(sent_data, SentenceID)
                    if text_dict:
                        sent_meta = sent_data['meta']
                        sent_meta['actortext'] = {}
                        sent_meta['eventtext'] = {}
                        sent_meta['actorroot'] = {}
                        for evt in coded_events:
                            # 16.04.30 pas bypasses problems with expansion
                            # of compounds
                            if evt in text_dict:
                                phrases = text_dict[evt]
                                sent_meta['actortext'][evt] = phrases[:2]
                                sent_meta['eventtext'][evt] = phrases[2]
                                sent_meta['actorroot'][evt] = phrases[3:5]

            if coded_events and PETRglobals.IssueFileName != "":
                event_issues = get_issues(SentenceText)
                if event_issues:
                    sent_data['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            NEvents += len(coded_events)
            if len(coded_events) == 0:
                NEmpty += 1

        if SkipStory:
            event_dict[key]['sents'] = None

    print("\nSummary:")
    print("Stories read:", NStory, "   Sentences coded:", NSent,
          "  Events generated:", NEvents)
    print("Discards:  Sentence", NDiscardSent, "  Story", NDiscardStory,
          "  Sentences without events:", NEmpty)
    print("Average Coding time = ", times / sents if sents else 0)
    return event_dict
コード例 #13
0
def test_reflexive2():
    """
    Verify get_meaning() for the node reached through children 1, 1, 1, 1, 1.

    If children are stored in bracket order this is the NP over "HIMSELF" in
    the embedded clause, and the expected code list is ["RUSGOV"].
    """
    raw_parse = "(S (NP (NNP Obama ) ) (VP (VBD knew ) (SBAR (IN that ) (S (NP (NNP Putin ) ) (VP (VBD liked ) (NP (PRP himself ) ) ) ) ) )  ) "
    parse_upper = raw_parse.upper()

    # NOTE(review): the text mentions Biden/"him" although the parse has
    # Putin/"himself"; reproduced unchanged from the original -- confirm.
    when = PETRreader.dstr_to_ordate("20150813")
    built = ptree.Sentence(parse_upper, "Obama knew that Biden liked him", when)

    outer_vp = built.tree.children[1]
    inner_clause = outer_vp.children[1].children[1]
    inner_vp = inner_clause.children[1]
    target = inner_vp.children[1]
    assert target.get_meaning() == ["RUSGOV"]
コード例 #14
0
def test_reflexive():
    """
    Verify get_meaning() for the node reached through children 1, 1.

    If children are stored in bracket order this is the NP over "HIMSELF"
    under the main verb phrase, and the expected code list is ["USAGOV"].
    """
    raw_parse = "(S (NP (NNP Obama ) )  (VP (VBD asked ) (NP (PRP himself ) )  (SBAR (WHADVP (WRB why ) ) (S (NP (NNP Biden ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) )"
    parse_upper = raw_parse.upper()

    built = ptree.Sentence(parse_upper,
                           "Obama asked himself why Biden was tired",
                           PETRreader.dstr_to_ordate("20150813"))

    target = built.tree
    for child_index in (1, 1):
        target = target.children[child_index]
    assert target.get_meaning() == ["USAGOV"]
コード例 #15
0
from petrarch2 import petrarch2, PETRglobals, PETRreader, PETRtree, utilities
from ConfigParser import ConfigParser
from flask import jsonify, make_response
from flask.ext.httpauth import HTTPBasicAuth
from flask.ext.restful import Resource, reqparse
from flask.ext.restful.representations.json import output_json
import os

# Module-level setup, executed once when this module is imported: parse the
# PETRARCH config at a fixed path, then load the dictionaries it names.
# NOTE(review): the hard-coded /app/... path presumably matches a container
# layout -- confirm against the deployment.
config = "/app/resources/PETR_config.ini"
PETRreader.parse_Config(config)
petrarch2.read_dictionaries()

class PhraseExtractAPI(Resource):
    def __init__(self):
        """Create the request parser for the JSON fields 'text' and 'parse',
        then run the base-class initialiser."""
        self.reqparse = parser = reqparse.RequestParser()
        # Both fields are unicode values taken from the JSON request body.
        for field_name in ('text', 'parse'):
            parser.add_argument(field_name, type=unicode, location='json')
        super(PhraseExtractAPI, self).__init__()

    def get(self):
        """Return the usage text describing the JSON body that POST expects."""
        # Written as separate lines joined by newlines so the trailing spaces
        # of the message are visible; the returned text is unchanged.
        usage_lines = (
            ' This service expects a POST in the form \'{"text":""Airstrikes ',
            '    and artillery...", "parse" : "(ROOT (S (S (NP (NP (NNP Airstrikes)) ',
            '    (CC and) (NP (NN artillery)))..."}\' It will return a list of nouns and verbs...TBD',
        )
        return '\n'.join(usage_lines)

    def post(self):
        """
        Handle a POST: parse the request and return the extracted phrases.

        Reads 'text' and 'parse' from the arguments produced by
        self.reqparse.parse_args(), prints the parsed arguments, and returns
        whatever self.get_phrases(text, parse) returns.
        """
        args = self.reqparse.parse_args()
        # Parenthesised form: valid on Python 2 and 3, same output for a
        # single value as the old print statement.
        print(args)
        return self.get_phrases(args['text'], args['parse'])