Example #1
def test_corenlp2naf():
    xml = open(os.path.join(os.path.dirname(__file__), "test_corenlp.xml")).read()
    naf_bytes = corenlp.corenlp2naf(xml, corenlp.PARSER)
    naf = KafNafParser(BytesIO(naf_bytes))
    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(set(terms.values()), {"John", "attack", "I", "in", "London", "hit", "he", "back", "."})
    london = [t for t in naf.get_terms() if t.get_lemma() == 'London'][0]
    assert_equal(london.get_pos(), 'R')
    assert_equal(london.get_morphofeat(), 'NNP')

    ents = {}
    for e in naf.get_entities():
        for ref in e.get_references():
            for term_id in ref.get_span().get_span_ids():
                ents[terms[term_id]] = e.get_type()
    assert_equal(ents, {"John": "PERSON", "London": "LOCATION"})

    deps = {terms[d.get_from()]: (d.get_function(), terms[d.get_to()])
            for d in naf.get_dependencies()}
    expected = {'I': ('nsubj', 'hit'),
                'John': ('nsubj', 'attack'),
                'London': ('prep_in', 'attack'),
                'back': ('advmod', 'hit'),
                'he': ('dobj', 'hit')}
    assert_equal(deps, expected)

    corefs = []
    for coref in naf.get_corefs():
        corefs.append(set())
        for span in coref.get_spans():
            corefs[-1] |= {terms[t] for t in span.get_span_ids()}
    assert_in({"John", "he"}, corefs)
def test_create_terms():
    """
    Can we create_terms via the create_{term,token} functions?
    """

    naf = KafNafParser(type="NAF")
    sent = 1
    offset = 0
    input = [(u'dit', u'dit', u'O', u'VNW'), (u'is', u'zijn', u'V', u'WW'),
             (u'een', u'een', u'D', u'LID'), (u'test', u'test', u'N', u'N')]

    offset = 0
    for (word, lemma, pos, morph) in input:
        token = naf.create_wf(word, 1, offset)
        offset += len(word)
        term = naf.create_term(lemma, pos, morph, [token])

    tokens = {t.get_id(): t for t in naf.get_tokens()}
    assert_equal(len(tokens), 4)

    result = {}
    for term in naf.get_terms():
        for token_id in term.get_span().get_span_ids():
            token = tokens[token_id]
            result[term.get_id()] = (token.get_text(), term.get_lemma(),
                                     term.get_pos(), term.get_morphofeat())
    result = [result[tid] for tid in sorted(result.keys())]
    assert_equal(input, result)
Example #3
    def process_single_file(self, file):
        try:
            xml_obj = KafNafParser(file)
        except:
            print>>sys.stderr,'Error parsing',file,': skipped'
            return        


        print>>sys.stderr,'Processing file', os.path.basename(file), 'Type:',xml_obj.get_type()
        self.langs[xml_obj.get_language()] += 1
        sentences = []
        current_sent = []
        this_sent = None

        pos_for_wid = {}  ## the POS for each token id (wid)
        for term in xml_obj.get_terms():
            w_ids = term.get_span().get_span_ids()
            pos = term.get_pos()
            for wid in term.get_span().get_span_ids():
                pos_for_wid[wid] = pos

        for token in xml_obj.get_tokens():
            wid = token.get_id()
            value = token.get_text()
            if self.convert_to_lowercase:
                value = value.lower()
                
            if value in self.punctuation:
                value = 'PUN'
                
            if value == '*':
                value = 'STAR'
            
            sentence = token.get_sent()
            if this_sent is not None and sentence != this_sent:  ## There is a new sent
                sentences.append(current_sent)
                current_sent = []
            current_sent.append((wid,value))
            this_sent = sentence
        ## Add the last sentence as well
        sentences.append(current_sent)
        
        for sentence in sentences:
            if self.include_sentence_delimiters:
                sentence.insert(0,('xxx','<S>'))
                sentence.append(('xxx','</S>'))
        
            for idx in range(0,len(sentence)):
                for ngramlen in range(self.min_ngram_len,self.max_ngram_len+1):
                    file_desc = self.get_file_desc_for_ngram(ngramlen)
                    start = idx
                    end = start + ngramlen
                    if end <= len(sentence):
                        this_ngram = '\t'.join(value for wid, value in sentence[start:end])
                        this_ngram_pos = '\t'.join(pos_for_wid.get(wid,'X') for wid, value in sentence[start:end])
                        file_desc.write(this_ngram.encode('utf-8')+'\t'+DELIMITER+'\t'+this_ngram_pos+'\n')
Example #4
def test_create_terms():
    """
    Can we create_terms via the create_{term,token} functions?
    """
    
    naf = KafNafParser(type="NAF")
    sent = 1
    offset = 0
    input = [(u'dit', u'dit', u'O', u'VNW'),
             (u'is', u'zijn', u'V', u'WW'),
             (u'een', u'een', u'D', u'LID'),
             (u'test', u'test', u'N', u'N')]

    offset = 0
    for (word, lemma, pos, morph) in input:
        token = naf.create_wf(word, 1, offset)
        offset += len(word)
        term = naf.create_term(lemma, pos, morph, [token])

    tokens = {t.get_id(): t for t in naf.get_tokens()}
    assert_equal(len(tokens), 4)
    
    result = {}
    for term in naf.get_terms():
        for token_id in term.get_span().get_span_ids():
            token = tokens[token_id]
            result[term.get_id()] = (token.get_text(), term.get_lemma(),
                                     term.get_pos(), term.get_morphofeat())
    result = [result[tid] for tid in sorted(result.keys())]
    assert_equal(input, result)
Example #5
File: tokens.py, Project: aemal/amcat
    def from_naf(self, article, naf):
        def _int(x):
            return None if x is None else int(x)
        naf = KafNafParser(BytesIO(naf.encode("utf-8")))

        deps = {dep.get_to(): (dep.get_function(), dep.get_from())
                for dep in naf.get_dependencies()}
        tokendict = {token.get_id(): token for token in naf.get_tokens()}

        for term in naf.get_terms():
            tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
            for token in tokens:
                tid = term.get_id()
                tok = {"aid": article,
                       "token_id": token.get_id(),
                       "offset": _int(token.get_offset()),
                       "sentence": _int(token.get_sent()),
                       "para": _int(token.get_para()),
                       "word": token.get_text(),
                       "term_id": tid,
                       "lemma": term.get_lemma(),
                       "pos": term.get_pos()}
                if tid in deps:
                    rel, parent = deps[tid]
                    tok['parent'] = parent
                    tok['relation'] = rel.split("/")[-1]
                yield tok
Example #6
    def convert(self, id, result, format):
        assert format == "csv"

        _int = lambda x: None if x is None else int(x)
        naf = KafNafParser(BytesIO(result.encode("utf-8")))

        deps = {dep.get_to(): (dep.get_function(), dep.get_from())
                for dep in naf.get_dependencies()}
        tokendict = {token.get_id(): token for token in naf.get_tokens()}

        s = StringIO()
        w = csv.writer(s)
        w.writerow(["id", "token_id", "offset", "sentence", "para", "word", "term_id",
                    "lemma", "pos", "pos1", "parent", "relation"])
        for term in naf.get_terms():
            tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
            for token in tokens:
                tid = term.get_id()
                pos = term.get_pos()
                pos1 = POSMAP[pos]
                # include the sentence number so the row lines up with the csv header above
                row = [id, token.get_id(), _int(token.get_offset()), _int(token.get_sent()),
                       _int(token.get_para()), token.get_text(), tid, term.get_lemma(), pos, pos1]
                if tid in deps:
                    rel, parent = deps[tid]
                    row += [parent, rel.split("/")[-1]]
                else:
                    row += [None, None]
                w.writerow(row)
        return s.getvalue()
Example #7
File: test_frog.py, Project: amcat/nlpipe
def test_frog_saf():
    _check_frog()
    naf_str = frog._process("Mark Rutte werkte gisteren nog bij de  Vrije Universiteit in Amsterdam")

    naf = KafNafParser(BytesIO(naf_str))
    lemmata = {t.get_lemma() for t in naf.get_terms()}
    assert_equal(lemmata, {"Mark_Rutte", "werken", "gisteren", "nog", "bij",
                           "de", "vrij", "universiteit", "in", "Amsterdam"})
Example #8
def test_corenlp_naf():
    _check_corenlp()
    naf_bytes = corenlp.corenlp_naf("John shoots himself", annotators=corenlp.LEMMATIZER)
    print(naf_bytes)
    naf = KafNafParser(BytesIO(naf_bytes))

    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(set(terms.values()), {"John", "shoot", "himself"})
Example #9
def test_corenlp_naf():
    _check_corenlp()
    naf_bytes = corenlp.corenlp_naf("John shoots himself",
                                    annotators=corenlp.LEMMATIZER)
    print(naf_bytes)
    naf = KafNafParser(BytesIO(naf_bytes))

    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(set(terms.values()), {"John", "shoot", "himself"})
Example #10
File: test_frog.py, Project: amcat/nlpipe
def test_frog_saf():
    _check_frog()
    naf_str = frog._process(
        "Mark Rutte werkte gisteren nog bij de  Vrije Universiteit in Amsterdam"
    )

    naf = KafNafParser(BytesIO(naf_str))
    lemmata = {t.get_lemma() for t in naf.get_terms()}
    assert_equal(
        lemmata, {
            "Mark_Rutte", "werken", "gisteren", "nog", "bij", "de", "vrij",
            "universiteit", "in", "Amsterdam"
        })
Example #11
def test_corenlp2naf():
    xml = open(os.path.join(os.path.dirname(__file__),
                            "test_corenlp.xml")).read()
    naf_bytes = corenlp.corenlp2naf(xml, corenlp.PARSER)
    naf = KafNafParser(BytesIO(naf_bytes))
    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(
        set(terms.values()),
        {"John", "attack", "I", "in", "London", "hit", "he", "back", "."})
    london = [t for t in naf.get_terms() if t.get_lemma() == 'London'][0]
    assert_equal(london.get_pos(), 'R')
    assert_equal(london.get_morphofeat(), 'NNP')

    ents = {}
    for e in naf.get_entities():
        for ref in e.get_references():
            for term_id in ref.get_span().get_span_ids():
                ents[terms[term_id]] = e.get_type()
    assert_equal(ents, {"John": "PERSON", "London": "LOCATION"})

    deps = {
        terms[d.get_from()]: (d.get_function(), terms[d.get_to()])
        for d in naf.get_dependencies()
    }
    expected = {
        'I': ('nsubj', 'hit'),
        'John': ('nsubj', 'attack'),
        'London': ('prep_in', 'attack'),
        'back': ('advmod', 'hit'),
        'he': ('dobj', 'hit')
    }
    assert_equal(deps, expected)

    corefs = []
    for coref in naf.get_corefs():
        corefs.append(set())
        for span in coref.get_spans():
            corefs[-1] |= {terms[t] for t in span.get_span_ids()}
    assert_in({"John", "he"}, corefs)
Example #12
File: nlpipe.py, Project: BBie/amcat
    def from_naf(self, naf):
        naf = KafNafParser(BytesIO(naf.encode("utf-8")))
        tokendict = {token.get_id(): token for token in naf.get_tokens()}

        for term in naf.get_terms():
            tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
            for token in tokens:
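                # NOTE: `article` is not a parameter of this variant of from_naf; it is
                # assumed to be available in the enclosing scope (compare the
                # from_naf(self, article, naf) version above, which takes it explicitly).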
                yield {"aid": article.pk,
                       "token_id": token.get_id(),
                       "offset": token.get_offset(),
                       "sentence": token.get_sent(),
                       "para": token.get_para(),
                       "word": token.get_text(),
                       "term_id": term.get_id(),
                       "lemma": term.get_lemma(),
                       "pos": term.get_pos()}
Example #13
    def convert(self, id, result, format):
        assert format == "csv"
        naf = KafNafParser(BytesIO(result.encode("utf-8")))
        memo = self._csv_memo(naf)
        tokendict = {token.get_id(): token for token in naf.get_tokens()}
        s = StringIO()
        w = csv.writer(s)
        w.writerow(self._csv_header())
        for term in naf.get_terms():
            tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
            for token in tokens:
                tid = term.get_id()
                pos = term.get_pos()
                pos1 = POSMAP[pos]
                row = [id] + list(self._csv_row(memo, term, token))
                w.writerow(row)
        return s.getvalue()
Example #14
def read_training_data(file_name):
    """
    read kaf/naf and matches the aspects with the words
    """
    parser = KafNafParser(PATH_ANNOTATED_DATA + file_name)
    terms = list(parser.get_terms())
    # create a token dictionary containing NAF info
    tokens_container = dict()
    for token_el in parser.get_tokens():
        token_node = token_el.node
        token_id = token_node.get('wid').replace('w', 't')
        token_info = token_node.attrib
        tokens_container[token_id] = token_info
    properties = list(parser.get_properties())
    handled_properties, term_dict = handle_properties(properties, terms,
                                                      tokens_container)
    return terms, properties, handled_properties, term_dict, tokens_container
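
A minimal usage sketch for read_training_data; the file name below is hypothetical, and PATH_ANNOTATED_DATA plus handle_properties are assumed to be defined in this module as in the snippet above:

terms, properties, handled_properties, term_dict, tokens_container = \
    read_training_data('restaurants_train.naf')
print(len(terms), 'terms and', len(properties), 'annotated properties loaded')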
Example #15
def extract_data_file(filename, label_gold, label_system, this_temp_folder=None, get_random=False):
    if this_temp_folder is None:
        temp_folder = mkdtemp()
    else:
        temp_folder = this_temp_folder
        
    fd_gold = open(temp_folder+'/'+__gold_filename__,'a')
    fd_system = open(temp_folder+'/'+__system_filename__, 'a')
    
    input_obj = KafNafParser(filename)
    for term in input_obj.get_terms():
        #Get gold
        term_id = term.get_id()
        results_gold = []
        results_system = []
        for ext_ref in term.get_external_references():
            resource = ext_ref.get_resource()
            if resource == label_gold:
                results_gold.append((ext_ref.get_reference(),ext_ref.get_confidence()))
            elif resource == label_system:
                results_system.append((ext_ref.get_reference(),ext_ref.get_confidence()))
        
        if len(results_gold) > 0:
            best_gold_label, best_gold_value = get_max_from_list(results_gold)
            fd_gold.write(filename+'\t'+term_id+'\t'+best_gold_label+'\n')
            
            if get_random:
                best_system_label, best_system_value = get_random_from_list(results_system)
            else:
                best_system_label, best_system_value = get_max_from_list(results_system)
                
            if best_system_label is not None:
                fd_system.write(filename+'\t'+term_id+'\t'+best_system_label+'\n')
    fd_gold.close()
    fd_system.close()
    
    #Create the "fake" sense.mappings
    fd_map = open(temp_folder+'/'+__sense_mapping__,'w')
    fd_map.close()
    return temp_folder
Example #16
def _test_file(this_file):
    input_fd = open(this_file)
    
    result = subprocess.check_output(os.path.join(__here__,'run_parser.sh'), stdin=input_fd)
    my_obj = KafNafParser(BytesIO(result))

    #Check the terms
    terms = [term for term in my_obj.get_terms()]
    assert_equal(len(terms),12)
    assert_equal(my_obj.get_term('t_4').get_lemma(),'mooi')
    assert_equal(my_obj.get_term('t_4').get_pos(),'adj')
    
    
    #Check constituents
    trees = [tree for tree in my_obj.get_trees()]
    assert_equal(len(trees),2)
    assert_equal(trees[0].get_terminals_as_list()[1].get_span().get_span_ids(),['t_1'])
    
    #Check dependencies
    dependencies = [dep for dep in my_obj.get_dependencies()]
    assert_equal(len(dependencies),10)
    assert_equal(dependencies[5].get_function(),'hd/su')
Example #17
    def process_single_file(self, file):
        try:
            xml_obj = KafNafParser(file)
        except:
            print >> sys.stderr, 'Error parsing', file, ': skipped'
            return

        print >> sys.stderr, 'Processing file', os.path.basename(
            file), 'Type:', xml_obj.get_type()
        self.langs[xml_obj.get_language()] += 1
        sentences = []
        current_sent = []
        this_sent = None

        pos_for_wid = {}  ## the POS for each token id (wid)
        for term in xml_obj.get_terms():
            w_ids = term.get_span().get_span_ids()
            pos = term.get_pos()
            for wid in term.get_span().get_span_ids():
                pos_for_wid[wid] = pos

        for token in xml_obj.get_tokens():
            wid = token.get_id()
            value = token.get_text()
            if self.convert_to_lowercase:
                value = value.lower()

            if value in self.punctuation:
                value = 'PUN'

            if value == '*':
                value = 'STAR'

            sentence = token.get_sent()
            if this_sent is not None and sentence != this_sent:  ## There is a new sent
                sentences.append(current_sent)
                current_sent = []
            current_sent.append((wid, value))
            this_sent = sentence
        ## Add the last sentence as well
        sentences.append(current_sent)

        for sentence in sentences:
            if self.include_sentence_delimiters:
                sentence.insert(0, ('xxx', '<S>'))
                sentence.append(('xxx', '</S>'))

            for idx in range(0, len(sentence)):
                for ngramlen in range(self.min_ngram_len,
                                      self.max_ngram_len + 1):
                    file_desc = self.get_file_desc_for_ngram(ngramlen)
                    start = idx
                    end = start + ngramlen
                    if end <= len(sentence):
                        this_ngram = '\t'.join(
                            value for wid, value in sentence[start:end])
                        this_ngram_pos = '\t'.join(
                            pos_for_wid.get(wid, 'X')
                            for wid, value in sentence[start:end])
                        file_desc.write(
                            this_ngram.encode('utf-8') + '\t' + DELIMITER +
                            '\t' + this_ngram_pos + '\n')
Example #18
def process_file(this_file, token_freq):
    xml_obj = KafNafParser(this_file)
    print>>sys.stderr,'Processing file',this_file
    token_for_wid = {}
    order_for_wid = {}
    opinion_expressions = []
    opinion_targets = []
    whole_text = ' '
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        token_for_wid[token.get_id()] = text
        order_for_wid[token.get_id()] = n
        whole_text += text + ' '
    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        tid = term.get_id()
        wids = term.get_span().get_span_ids()
        wids_for_tid[tid] = wids
        for wid in wids:
            lemma_for_wid[wid] = term.get_lemma()
            pos_for_wid[wid] = term.get_pos()

    ##Properties!
    aspects = [] ## [(label,term_span)...]
    
    for property in xml_obj.get_properties():
        for refs in property.get_references():
            for span in refs:
                aspects.append((property.get_type(),span.get_span_ids()))

    already_counted = {EXP:set(), TAR:set()}
    
    for opinion in xml_obj.get_opinions():   
        for this_type, opinion_obj in [(EXP,opinion.get_expression()),(TAR,opinion.get_target())]:
            if this_type is EXP and opinion_obj.get_polarity()=='NON-OPINIONATED':
                continue
            if opinion_obj is not None:
                span = opinion_obj.get_span()
                if span is not None:
                    list_wids = []
                    for tid in span.get_span_ids():
                        list_wids.extend(wids_for_tid.get(tid,[]))
                    list_wids.sort(key=lambda wid: order_for_wid[wid])  ## Sorted according to the order of the tokens
                    
                    string_wids = '#'.join(list_wids)
                    opinion_tokens = ' '.join( token_for_wid[wid] for wid in list_wids)
                    opinion_lemmas = ' '.join( lemma_for_wid[wid] for wid in list_wids)
                    opinion_pos    = ' '.join( pos_for_wid[wid]   for wid in list_wids)

                    if string_wids not in already_counted[this_type]:
                        if this_type == EXP:
                            polarity = (opinion_obj.get_polarity()).lower()
                            opinion_expressions.append((opinion_tokens,polarity,opinion_lemmas,opinion_pos))
                        else:
                            ##Calculate the aspect type
                            possible_aspects = []
                            target_ids = span.get_span_ids()
                            for aspect_label, aspect_span in aspects:
                                num_in_common = len(set(target_ids) & set(aspect_span))
                                if num_in_common != 0:
                                    possible_aspects.append((aspect_label,num_in_common,len(aspect_span)))
                            aspect_for_target = 'unknown'

                            if len(possible_aspects) != 0:
                                ## Sort by the number of ids in common first, then by the length of the aspect
                                aspect_for_target = sorted(possible_aspects,key=lambda t: (t[1],t[2]), reverse=True)[0][0]
                            opinion_targets.append((opinion_tokens,aspect_for_target, opinion_lemmas,opinion_pos))
                        already_counted[this_type].add(string_wids)

    del xml_obj
    print>>sys.stderr,'\tNumber of opinion expressions:',len(opinion_expressions)
    print>>sys.stderr,'\tNumber of opinion targets:',len(opinion_targets)
    print>>sys.stderr,'\tNumber of characters of the text:',len(whole_text)
    return opinion_expressions, opinion_targets, whole_text
Example #19
import sys

if __name__ == '__main__':

    #Load Wordnet
    synset_for_skey = {}
    path_to_index_sense = '/home/izquierdo/wordnets/wordnet-3.0/dict/index.sense'
    fd = open(path_to_index_sense)
    for line in fd:
        fields = line.split()
        synset_for_skey[fields[0]] = fields[1]
    fd.close()

    naf_obj = KafNafParser(sys.stdin)

    for term in naf_obj.get_terms():
        this_skey = None
        this_synset = None
        ref_skey = ref_synset = None
        for ext_ref in term.get_external_references():
            if ext_ref.get_reftype() == 'sense':
                this_skey = ext_ref.get_reference()
                ref_skey = ext_ref
            if ext_ref.get_reftype() == 'ilidef':
                this_synset = ext_ref.get_reference()
                ref_synset = ext_ref

        if this_synset == '':
            print >> sys.stderr, term.get_id()
            if '%3:' in this_skey:
                this_skey = this_skey.replace('%3:', '%5:')
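                # WordNet sense keys encode the synset type after the '%' (3 = adjective,
                # 5 = satellite adjective), so a key that is missing as %3: may be listed
                # in index.sense under its %5: variant.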
Example #20
def load_naf_stdin():
    """Load a dataset in NAF format.

    Use this function to create a new ConlluDataset from a NAF file,
    read from stdin.

    NOTE: you can only add to NAF files, not create one from scratch.
    """
    my_parser = KafNafParser(sys.stdin)

    my_dataset = ConlluDataset()

    # a big look-up table: for any NAF id, return a hash with
    # {sent_id, token_id} in the ConlluDataset
    naf2conll_id = {}

    # collect the sentences in a hash, indexed by token_obj.get_sent()
    sentences = {}

    # iterate over the tokens to get: ID, FORM
    for token_obj in my_parser.get_tokens():
        # (string) identifier of the sentence
        sent_id = token_obj.get_sent()
        if sent_id in sentences:
            sentence = sentences[sent_id]
        else:
            sentence = Sentence(sent_id=sent_id)
            sentences[sent_id] = sentence

        # (string) number of the token in the sentence, starting at '1'
        token_id = '{}'.format(len(sentence) + 1)  # ID

        new_token = Token([
            token_id,  # ID
            token_obj.get_text(),  # FORM
            '_',  # LEMMA
            '_',  # UPOS
            '_',  # XPOS
            '_',  # FEATS
            '0',  # HEAD -> to be overwritten later
            'root',  # DEPREL -> to be overwritten later
            '_',  # DEPS
            '_'  # MISC
        ])

        sentence.add(new_token)

        # to match a NAF span to conll tokens, we need sent_id and token_id
        naf2conll_id[token_obj.get_id()] = {
            'sent_id': sent_id,
            'token_id': token_id
        }

    # iterate over the term to get: LEMMA, XPOS, UPOS, FEATS, sent_id, nafid
    for term_obj in my_parser.get_terms():
        # span
        # TODO: for now, assume terms map one-on-one on tokens
        nafid = term_obj.get_span().get_span_ids()
        if len(nafid) > 1:
            logging.error('Multi-word tokens not implemented yet.')
            return
        nafid = nafid[0]

        conllid = naf2conll_id[nafid]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]

        token_id = conllid['token_id']
        token = sentence[token_id]

        # store the identifier of the NAF term on the token, so we can add
        # information to the NAF later.
        token.nafid = term_obj.get_id()

        token.LEMMA = term_obj.get_lemma()

        # the NAF pos="" attribute is lower case, UD UPOS is upper case
        token.UPOS = term_obj.get_pos().upper()

        # naf: A(B,C) -> ud: A|B|C
        xpos = term_obj.get_morphofeat()
        if xpos:
            token.XPOS = xpos.replace('(', '|').replace(')',
                                                        '').replace(',', '|')
            if token.XPOS[-1] == '|':
                token.XPOS = token.XPOS[:-1]

        # look for an external reference containing FEATS
        for ext_ref in term_obj.get_external_references():
            if ext_ref.get_reftype() == 'FEATS':
                token.FEATS = ext_ref.get_reference()

        # to match NAF dependencies to conll tokens, we need sent_id and token_id
        naf2conll_id[term_obj.get_id()] = {
            'sent_id': sent_id,
            'token_id': token_id
        }

    # iterate over the dependencies to get: HEAD, DEPREL
    for dep_obj in my_parser.get_dependencies():
        # from
        conllid = naf2conll_id[dep_obj.get_from()]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]

        token_id = conllid['token_id']
        token_from = sentence[token_id]

        # to
        conllid = naf2conll_id[dep_obj.get_to()]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]

        token_id = conllid['token_id']
        token_to = sentence[token_id]

        # function
        depfunc = dep_obj.get_function()

        token_to.HEAD = token_from.ID
        token_to.DEPREL = depfunc

    # A final conversion of our list of sentences to a ConlluDataset
    for sent_id in sentences:
        sentence = sentences[sent_id]

        # construct the sentence.full_text
        raw_tokens = []
        for token in sentence:
            raw_tokens.append(token.FORM)
        sentence.full_text = ' '.join(raw_tokens)

        # add to the dataset
        my_dataset.add(sentence)

    my_dataset.naf2conll_id = naf2conll_id

    return my_dataset, my_parser
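
A minimal usage sketch for load_naf_stdin, assuming it lives in a script that is fed NAF on standard input (e.g. piped from a file); the final print is illustrative only and uses the naf2conll_id mapping built above:

if __name__ == '__main__':
    dataset, naf_parser = load_naf_stdin()
    # naf2conll_id maps every NAF token/term id to its {sent_id, token_id} pair
    print('mapped', len(dataset.naf2conll_id), 'NAF ids to CoNLL positions')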
Example #21
def find_terms(naf: KafNafParser, words: Sequence[str]) -> Iterable[Cterm]:
    """Find all terms whose lemma or word form is in the list of words"""
    for t in naf.get_terms():
        if t.get_lemma() in words or get_word(naf, t) in words:
            yield t
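
A minimal usage sketch for find_terms; "parsed.naf" is a hypothetical input file, and only calls shown elsewhere in these examples (KafNafParser(path), get_id, get_lemma) are used:

from KafNafParserPy import KafNafParser

naf = KafNafParser("parsed.naf")
for term in find_terms(naf, ["John", "London"]):
    print(term.get_id(), term.get_lemma())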
Example #22
# with help from Ruben Izquierdo


from KafNafParserPy import KafNafParser
import re
import sys 
from collections import OrderedDict
import codecs

input = sys.stdin

my_parser = KafNafParser(input)

### We first need a list of the predicates that we want to create feature vectors for
predicates = {} 
for term_obj in my_parser.get_terms():
	predicate = re.match("WW", term_obj.get_morphofeat())
	if predicate is not None:
		predicates[term_obj.get_id()] = term_obj.get_pos()
		#print term_obj.get_id(), term_obj.get_morphofeat(), term_obj.get_lemma()

# We need the dependencies to find out the structure of the argument patterns
# and also to know which verbs are auxiliary verbs and which ones are main verbs
dependencies = {} 
for dep_obj in my_parser.get_dependencies():
	relparts = dep_obj.get_function().split('/')
	rel_from = relparts[0]
	rel_to = relparts[1]
	dep_id = dep_obj.get_from() + '-' + dep_obj.get_to() 
	dependencies[dep_id] = dep_obj.get_function()
Example #23
                        required=True)
    parser.add_argument('-ov',
                        dest='output_version',
                        help='Output WN version of the synsets',
                        required=True)
    parser.add_argument('-or',
                        dest='output_res_label',
                        help='Output resource label for synset references',
                        required=True)

    args = parser.parse_args()

    mapping = load_mapping(args.input_version, args.output_version)

    obj = KafNafParser(args.input_file)
    for term in obj.get_terms():
        source_synset = None
        for ext_ref in term.get_external_references():
            if ext_ref.get_resource() == args.input_res_label and ext_ref.get_reftype() == 'synset':
                source_synset = ext_ref.get_reference()
                break
        if source_synset is not None:
            fields = source_synset.split('-')
            this_synset = fields[1]
            short_pos = fields[2]
            if short_pos == 'a': this_pos = ADJ
            elif short_pos == 'n': this_pos = NOUN
            elif short_pos == 'r': this_pos = ADV
            elif short_pos == 'v': this_pos = VERB
            else: this_pos = None
Example #24
def add_file(filename, data_lexelt, reftype='lexical_key'):
    obj = KafNafParser(filename)
    tokens_per_sent = {}
    sent_for_token = {}
    sents_in_order = []
    for token in obj.get_tokens():
        sentid = token.get_sent()
        if sentid not in sents_in_order:
            sents_in_order.append(sentid)
        sent_for_token[token.get_id()] = sentid
        if sentid not in tokens_per_sent: tokens_per_sent[sentid] = []
        tokens_per_sent[sentid].append((token.get_id(), token.get_text()))

    annotated_lemmas = []  # LIST of (full_id, token ids, lemma,pos,synset)
    for term in obj.get_terms():
        synset_label = None
        for ext_ref in term.get_external_references():
            if ext_ref.get_reftype() == 'lexical_key':
                synset_label = term.get_lemma() + '%' + ext_ref.get_reference()
            elif ext_ref.get_reftype() == 'sense' and ext_ref.get_resource() == 'WordNet-3.0':
                synset_label = ext_ref.get_reference()
            if synset_label is not None:
                break

        if synset_label is not None:
            annotated_lemmas.append(
                (filename + '#' + term.get_id(),
                 term.get_span().get_span_ids(), term.get_lemma(),
                 term.get_pos(), synset_label))

    for full_id, token_ids, lemma, pos, synset_label in annotated_lemmas:
        #CREATE NEW INSTANCE

        this_key = lemma + '.' + pos.lower()[0]
        if this_key not in data_lexelt:
            data_lexelt[this_key] = Clexelt(this_key, pos)

        if not data_lexelt[this_key].exists(full_id):
            #Create the new instance
            new_instance = Cinstance()
            new_instance.id = full_id
            new_instance.docsrc = filename
            new_instance.key = synset_label

            tokens = []
            target_indexes = []
            this_sent = sent_for_token[token_ids[0]]
            index = sents_in_order.index(this_sent)
            start_idx = max(index - 2, 0)
            end_idx = min(index + 2, len(sents_in_order) - 1)
            selected_sents = sents_in_order[start_idx:end_idx + 1]
            num_token = 0
            for current_sent in selected_sents:
                for token_id, token_text in tokens_per_sent[str(current_sent)]:
                    tokens.append(token_text)
                    if token_id in token_ids:
                        target_indexes.append(num_token)
                    num_token += 1

            new_instance.tokens = tokens[:]
            new_instance.index_head = target_indexes[:]
            data_lexelt[this_key].add_instance(new_instance)
Example #25
def process_file(this_file, token_freq):
    xml_obj = KafNafParser(this_file)
    print >> sys.stderr, 'Processing file', this_file
    token_for_wid = {}
    order_for_wid = {}
    opinion_expressions = []
    opinion_targets = []
    whole_text = ' '
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        token_for_wid[token.get_id()] = text
        order_for_wid[token.get_id()] = n
        whole_text += text + ' '
    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        tid = term.get_id()
        wids = term.get_span().get_span_ids()
        wids_for_tid[tid] = wids
        for wid in wids:
            lemma_for_wid[wid] = term.get_lemma()
            pos_for_wid[wid] = term.get_pos()

    ##Properties!
    aspects = []  ## [(label,term_span)...]

    for property in xml_obj.get_properties():
        for refs in property.get_references():
            for span in refs:
                aspects.append((property.get_type(), span.get_span_ids()))

    already_counted = {EXP: set(), TAR: set()}

    for opinion in xml_obj.get_opinions():
        for this_type, opinion_obj in [(EXP, opinion.get_expression()),
                                       (TAR, opinion.get_target())]:
            if this_type is EXP and opinion_obj.get_polarity() == 'NON-OPINIONATED':
                continue
            if opinion_obj is not None:
                span = opinion_obj.get_span()
                if span is not None:
                    list_wids = []
                    for tid in span.get_span_ids():
                        list_wids.extend(wids_for_tid.get(tid, []))
                    list_wids.sort(key=lambda wid: order_for_wid[wid])  ## Sorted according to the order of the tokens

                    string_wids = '#'.join(list_wids)
                    opinion_tokens = ' '.join(token_for_wid[wid]
                                              for wid in list_wids)
                    opinion_lemmas = ' '.join(lemma_for_wid[wid]
                                              for wid in list_wids)
                    opinion_pos = ' '.join(pos_for_wid[wid]
                                           for wid in list_wids)

                    if string_wids not in already_counted[this_type]:
                        if this_type == EXP:
                            polarity = (opinion_obj.get_polarity()).lower()
                            opinion_expressions.append(
                                (opinion_tokens, polarity, opinion_lemmas,
                                 opinion_pos))
                        else:
                            ##Calculate the aspect type
                            possible_aspects = []
                            target_ids = span.get_span_ids()
                            for aspect_label, aspect_span in aspects:
                                num_in_common = len(
                                    set(target_ids) & set(aspect_span))
                                if num_in_common != 0:
                                    possible_aspects.append(
                                        (aspect_label, num_in_common,
                                         len(aspect_span)))
                            aspect_for_target = 'unknown'

                            if len(possible_aspects) != 0:
                                ## Sort by the number of ids in common first, then by the length of the aspect
                                aspect_for_target = sorted(possible_aspects,
                                                           key=lambda t:
                                                           (t[1], t[2]),
                                                           reverse=True)[0][0]
                            opinion_targets.append(
                                (opinion_tokens, aspect_for_target,
                                 opinion_lemmas, opinion_pos))
                        already_counted[this_type].add(string_wids)

    del xml_obj
    print >> sys.stderr, '\tNumber of opinion expressions:', len(
        opinion_expressions)
    print >> sys.stderr, '\tNumber of opinion targets:', len(opinion_targets)
    print >> sys.stderr, '\tNumber of characters of the text:', len(whole_text)
    return opinion_expressions, opinion_targets, whole_text