Example #1
def get_sentence(naf: KafNafParser, term: Cterm) -> int:
    tokens = [
        naf.get_token(tid)
        for tid in naf.get_dict_tokens_for_termid(term.get_id())
    ]
    sent = {t.get_sent() for t in tokens}
    if len(sent) != 1:
        raise Exception(
            f"Term {term.get_id}:{term.get_lemma()} did not map to single sentence: {sent}"
        )
    return sent.pop()
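
A minimal usage sketch, assuming a NAF document at the hypothetical path doc.naf:

from KafNafParserPy import KafNafParser

naf = KafNafParser("doc.naf")  # hypothetical input path
for term in naf.get_terms():
    print(term.get_id(), get_sentence(naf, term))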
Example #2
import os
import sys
import tempfile
import pickle as pickler  # assumed alias; the original module's pickle import is not shown
from collections import defaultdict
from KafNafParserPy import KafNafParser


def main(inputfile,
         this_type,
         folder,
         overall_parameters=None,
         detected_dse=None,
         log=False):
    # Avoid shared mutable default arguments; the 'test' branch mutates
    # overall_parameters in place.
    if overall_parameters is None:
        overall_parameters = {}
    if detected_dse is None:
        detected_dse = []

    files = []
    output_fd = None
    if this_type == 'train':
        output_fd = open(os.path.join(folder, TRAINING_FILENAME), 'w')

        # Save the parameters
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename, 'wb')  # pickle needs a binary file
        pickler.dump(overall_parameters, fd_parameter, protocol=0)
        print('Parameters saved to file %s' % parameter_filename, file=sys.stderr)
        fd_parameter.close()

        # Input is a file containing a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()

    elif this_type == 'tag':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'rb')  # pickle needs a binary file
        overall_parameters = pickler.load(fd_param)
        fd_param.close()

        # Input is a single file
        files.append(inputfile)

        # Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif this_type == 'test':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'rb')  # pickle needs a binary file
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val

        # Input is a file containing a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(os.path.join(folder, TESTING_FILENAME), 'w')

    gold_fd = None
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename, 'w')

    for filename in files:
        if log:
            print('HOLDER: processing file', filename, file=sys.stderr)

        if isinstance(filename, KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)

        create_structures(naf_obj, filename)

        # Extract all the opinions
        opinions_per_sentence = defaultdict(list)

        num_opinions = 0

        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                if p != 'NON-OPINIONATED':
                    #if p.startswith('D-'):
                    holder = opinion.get_holder()
                    if holder is not None:
                        span = holder.get_span()
                        if span is not None:
                            span_ids = span.get_span_ids()
                            if len(span_ids) != 0:
                                sentence_id = get_sentence_id_for_opinion(
                                    naf_obj, opinion)
                                if sentence_id is not None:
                                    opinions_per_sentence[sentence_id].append(
                                        opinion)
                                    num_opinions += 1

        if log:
            print('\tNum of opinions:', num_opinions, file=sys.stderr)

        if this_type == 'train':
            # For training, one sequence is created for every DSE
            # (the same sentence may appear multiple times).
            sentences_with_opinions = set()
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    sentences_with_opinions.add(this_sentence)
                    create_sequence(naf_obj,
                                    this_type,
                                    this_sentence,
                                    overall_parameters,
                                    opinion,
                                    output=output_fd)

            # Include the remaining sentences without opinions
            '''
            for sentence_id in naf_obj.list_sentence_ids:
                if sentence_id not in sentences_with_opinions:
                    create_sequence(naf_obj, sentence_id, overall_parameters, list_opinions=[])
            '''

        elif this_type == 'tag':
            # Obtain the opinions per sentence
            opinions_per_sentence = defaultdict(list)
            for list_name_ids, list_words in detected_dse:
                list_ids = [v[v.rfind('#') + 1:] for v in list_name_ids]
                first_token = naf_obj.get_token(list_ids[0])
                sentence_for_opinion = first_token.get_sent()
                opinions_per_sentence[sentence_for_opinion].append(list_ids)

            for this_sentence, these_opinions in opinions_per_sentence.items():
                for list_dse_token_ids in these_opinions:
                    create_sequence(naf_obj,
                                    this_type,
                                    this_sentence,
                                    overall_parameters,
                                    opinion=list_dse_token_ids,
                                    output=output_fd,
                                    log=log)

        elif this_type == 'test':
            opinion_list = []
            '''
            for sentence_id in naf_obj.list_sentence_ids:
                if sentence_id in opinions_per_sentence:
                    for this_sentence, these_opinions in opinions_per_sentence.items():
                        for opinion in these_opinions:
                            create_sequence(naf_obj, this_type, this_sentence, overall_parameters,opinion, output = output_fd)
                            opinion_list.append(opinion)
                else:
                    create_sequence(naf_obj, this_type, sentence_id, overall_parameters,opinion=None, output = output_fd)
               
            '''
            # For testing, one sequence is created per opinion; only the
            # sentences that contain opinions are included.
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    create_sequence(naf_obj,
                                    this_type,
                                    this_sentence,
                                    overall_parameters,
                                    opinion,
                                    output=output_fd)
                    opinion_list.append(opinion)

            # Also create the gold standard data
            if gold_fd is not None:
                create_gold_standard_holder(naf_obj, opinion_list, gold_fd)

    if gold_fd is not None:
        gold_fd.close()
        print('Gold standard in the file %s' % gold_fd.name, file=sys.stderr)

    return output_fd.name
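
A hedged sketch of how this entry point might be invoked for training; the file-list path and model folder are hypothetical, and the module-level constants (TRAINING_FILENAME, PARAMETERS_FILENAME) and helpers (create_structures, create_sequence, create_gold_standard_holder) are assumed to be defined elsewhere in the source module:

# 'file_list.txt' lists one NAF file per line; 'model_folder' must already exist.
train_file = main('file_list.txt', 'train', 'model_folder',
                  overall_parameters={'gold_standard': None})
print('Training data written to', train_file)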
Example #3
def get_word(naf: KafNafParser, term: Cterm) -> str:
    """Get the word(s) belonging to a term, joining them if there's more than one"""
    tokenids = naf.get_dict_tokens_for_termid(term.get_id())
    tokens = sort_tokens(naf.get_token(tid) for tid in tokenids)
    return " ".join(t.get_text() for t in tokens)
Example #4
#!/usr/bin/env python

from KafNafParserPy import KafNafParser
import sys

if __name__ == '__main__':
    naf_obj = KafNafParser(sys.stdin)

    candidates = {'is', 'are', 'were', 'was', "'s"}
    for term in naf_obj.get_terms():
        if term.get_lemma() == 'i':
            token_id = term.get_span().get_span_ids()[0]
            token = naf_obj.get_token(token_id).get_text()
            if token in candidates:
                term.set_lemma('be')
                term.set_pos('VBZ')

    naf_obj.dump()
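
The script acts as a stdin-to-stdout filter: naf_obj.dump() writes the modified document to standard output, so a hypothetical invocation is python fix_be_lemmas.py < input.naf > output.naf.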
Example #5
for entity in my_parser.get_entities():  # get the entities identified through NER
    if entity.get_type() == "LOCATION":  # if it's a location, then let's get its term references
        entity_type = entity.get_type()  # get the entity type
        entity_id = entity.get_id()  # get the entity id
        ref = entity.get_node().find("references")  # find the references node in the KAF file
        targets = ref.find("span").findall("target")  # get a list of all the targets in the references node
        # Get the first target term id, so we can generate the term text in the next two lines.
        term_id = targets[0].attrib["id"]
        word_id = my_parser.get_term(term_id).get_node().find("span").find("target").attrib["id"]  # get the word id
        word = my_parser.get_token(word_id).get_text()  # get the word text

        # simpledict[word] = freqdict[word]  # generate a simple tally that ignores whether each LOCATION is valid;
        # commented out since OpenRefine will be used to validate

        sentence = ""  # start a blank KWIC sentence variable
        for target in targets:  # iterate through all targets in the references element for the entity node
            # Get the token id for the target (numeric part of the "w..." word id).
            tid = int(my_parser.get_term(target.attrib["id"]).get_node().find("span").find("target").attrib["id"][1:])
            for x in range(tid - 15, tid + 15):  # iterate through the tokens 15 before and 15 after the target
                if my_parser.get_token("w" + str(x)):
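                    # Hypothetical continuation: the source page truncates the
                    # example here; append each nearby token to the KWIC window.
                    sentence += my_parser.get_token("w" + str(x)).get_text() + " "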
Example #6
def main(inputfile,
         this_type,
         folder,
         overall_parameters=None,
         detected_dse=None,
         log=False):
    # Avoid shared mutable default arguments; the 'test' branch mutates
    # overall_parameters in place.
    if overall_parameters is None:
        overall_parameters = {}
    if detected_dse is None:
        detected_dse = []

    files = []
    output_fd = None
    if this_type == 'train':
        output_fd = open(os.path.join(folder, TRAINING_FILENAME), 'w')

        # Save the parameters
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename, 'wb')  # pickle needs a binary file
        pickler.dump(overall_parameters, fd_parameter, protocol=0)
        print('Parameters saved to file %s' % parameter_filename, file=sys.stderr)
        fd_parameter.close()

        # Input is a file containing a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()

    elif this_type == 'tag':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'rb')  # pickle needs a binary file
        overall_parameters = pickler.load(fd_param)
        fd_param.close()

        # Input is a single file
        files.append(inputfile)

        # Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif this_type == 'test':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'rb')  # pickle needs a binary file
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val

        # Input is a file containing a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(os.path.join(folder, TESTING_FILENAME), 'w')

    gold_fd = None
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename, 'w')

    for filename in files:
        if log:
            print('TARGET: processing file', filename, file=sys.stderr)

        if isinstance(filename, KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)

        create_structures(naf_obj, filename)

        # Extract all the opinions
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0

        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                if p != 'NON-OPINIONATED':
                    target = opinion.get_target()
                    if target is not None:
                        span = target.get_span()
                        if span is not None:
                            span_ids = span.get_span_ids()
                            if len(span_ids) != 0:
                                sentence_id = get_sentence_id_for_opinion(naf_obj, opinion)
                                if sentence_id is not None:
                                    opinions_per_sentence[sentence_id].append(opinion)
                                    num_opinions += 1

        if log:
            print('\tNum of opinions:', num_opinions, file=sys.stderr)

        if this_type == 'train':
            # For training, one sequence is created for every DSE
            # (the same sentence may appear multiple times).
            sentences_with_opinions = set()
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    sentences_with_opinions.add(this_sentence)
                    create_sequence(naf_obj, this_type, this_sentence, overall_parameters, opinion, output=output_fd)

            # Include the remaining sentences without opinions
            '''
            for sentence_id in naf_obj.list_sentence_ids:
                if sentence_id not in sentences_with_opinions:
                    create_sequence(naf_obj, sentence_id, overall_parameters, list_opinions=[])
            '''

        elif this_type == 'tag':
            # Obtain the opinions per sentence
            opinions_per_sentence = defaultdict(list)
            for list_name_ids, list_words in detected_dse:
                list_ids = [v[v.rfind('#') + 1:] for v in list_name_ids]
                first_token = naf_obj.get_token(list_ids[0])
                sentence_for_opinion = first_token.get_sent()
                opinions_per_sentence[sentence_for_opinion].append(list_ids)

            for this_sentence, these_opinions in opinions_per_sentence.items():
                for list_dse_token_ids in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence, overall_parameters, opinion=list_dse_token_ids, output=output_fd, log=log)

        elif this_type == 'test':
            # For testing, one sequence is created per opinion; only the
            # sentences that contain opinions are included.
            opinion_list = []
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence, overall_parameters, opinion, output=output_fd)
                    opinion_list.append(opinion)

            if gold_fd is not None:
                create_gold_standard_target(naf_obj, opinion_list, gold_fd)

    if gold_fd is not None:
        gold_fd.close()
        print('Gold standard in the file %s' % gold_fd.name, file=sys.stderr)

    return output_fd.name