Example no. 1
    def convert(self, id, result, format):
        assert format == "csv"

        _int = lambda x: None if x is None else int(x)
        naf = KafNafParser(BytesIO(result.encode("utf-8")))

        deps = {dep.get_to(): (dep.get_function(), dep.get_from())
                for dep in naf.get_dependencies()}
        tokendict = {token.get_id(): token for token in naf.get_tokens()}

        s = StringIO()
        w = csv.writer(s)
        w.writerow(["id", "token_id", "offset", "sentence", "para", "word", "term_id",
                    "lemma", "pos", "pos1", "parent", "relation"])
        for term in naf.get_terms():
            tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
            for token in tokens:
                tid = term.get_id()
                pos = term.get_pos()
                pos1 = POSMAP[pos]
                row = [id, token.get_id(), _int(token.get_offset()), _int(token.get_sent()),
                       _int(token.get_para()), token.get_text(), tid, term.get_lemma(), pos, pos1]
                if tid in deps:
                    rel, parent = deps[tid]
                    row += [parent, rel.split("/")[-1]]
                else:
                    row += [None, None]
                w.writerow(row)
        return s.getvalue()
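The convert method above returns a plain CSV string with the header row written first. A minimal sketch of reading such a string back; the two data rows below are made up for illustration and do not come from a real NAF document:

import csv
from io import StringIO

# Made-up CSV standing in for the string returned by convert() above.
csv_text = ("id,token_id,offset,sentence,para,word,term_id,lemma,pos,pos1,parent,relation\n"
            "42,w1,0,1,1,Dit,t1,dit,pron,O,t2,su\n"
            "42,w2,4,1,1,is,t2,zijn,verb,V,,\n")
for row in csv.DictReader(StringIO(csv_text)):
    print(row["word"], row["lemma"], row["pos1"], row["relation"] or "-")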
Example no. 2
def test_create_terms():
    """
    Can we create_terms via the create_{term,token} functions?
    """
    
    naf = KafNafParser(type="NAF")
    sent = 1
    offset = 0
    input = [(u'dit', u'dit', u'O', u'VNW'),
             (u'is', u'zijn', u'V', u'WW'),
             (u'een', u'een', u'D', u'LID'),
             (u'test', u'test', u'N', u'N')]

    offset = 0
    for (word, lemma, pos, morph) in input:
        token = naf.create_wf(word, 1, offset)
        offset += len(word)
        term = naf.create_term(lemma, pos, morph, [token])

    tokens = {t.get_id(): t for t in naf.get_tokens()}
    assert_equal(len(tokens), 4)
    
    result = {}
    for term in naf.get_terms():
        for token_id in term.get_span().get_span_ids():
            token = tokens[token_id]
            result[term.get_id()] = (token.get_text(), term.get_lemma(),
                                     term.get_pos(), term.get_morphofeat())
    result = [result[tid] for tid in sorted(result.keys())]
    assert_equal(input, result)
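As a hedged follow-up, the one-token-per-term structure built in this test can also be checked through map_tokens_to_terms (a call that already appears in Example no. 12 of this listing); this is a sketch, not part of the original test:

def check_token_term_alignment(naf):
    # naf is a KafNafParser built as above, with exactly one term per token
    token_ids = [t.get_id() for t in naf.get_tokens()]
    term_ids = naf.map_tokens_to_terms(token_ids)
    # a strict one-to-one mapping means both collections have the same size
    assert len(term_ids) == len(token_ids)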
Example no. 3
    def from_naf(self, article, naf):
        def _int(x):
            return None if x is None else int(x)
        naf = KafNafParser(BytesIO(naf.encode("utf-8")))

        deps = {dep.get_to(): (dep.get_function(), dep.get_from())
                for dep in naf.get_dependencies()}
        tokendict = {token.get_id(): token for token in naf.get_tokens()}

        for term in naf.get_terms():
            tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
            for token in tokens:
                tid = term.get_id()
                tok = {"aid": article,
                       "token_id": token.get_id(),
                       "offset": _int(token.get_offset()),
                       "sentence": _int(token.get_sent()),
                       "para": _int(token.get_para()),
                       "word": token.get_text(),
                       "term_id": tid,
                       "lemma": term.get_lemma(),
                       "pos": term.get_pos()}
                if tid in deps:
                    rel, parent = deps[tid]
                    tok['parent'] = parent
                    tok['relation'] = rel.split("/")[-1]
                yield tok
Example no. 4
    def process_single_file(self, file):
        try:
            xml_obj = KafNafParser(file)
        except:
            print >> sys.stderr, 'Error parsing', file, ': skipped'
            return

        print >> sys.stderr, 'Processing file', os.path.basename(file), 'Type:', xml_obj.get_type()
        self.langs[xml_obj.get_language()] += 1
        sentences = []
        current_sent = []
        this_sent = None

        pos_for_wid = {} ## For each token id (wid) the pos of it
        for term in xml_obj.get_terms():
            w_ids = term.get_span().get_span_ids()
            pos = term.get_pos()
            for wid in term.get_span().get_span_ids():
                pos_for_wid[wid] = pos

            
        for token in xml_obj.get_tokens():
            wid = token.get_id()
            value = token.get_text()
            if self.convert_to_lowercase:
                value = value.lower()
                
            if value in self.punctuation:
                value = 'PUN'
                
            if value == '*':
                value = 'STAR'
            
            sentence = token.get_sent()
            if this_sent is not None and sentence != this_sent:  ## There is a new sent
                sentences.append(current_sent)
                current_sent = []
            current_sent.append((wid,value))
            this_sent = sentence
        ## Add the last sentence as well
        sentences.append(current_sent)
        
        for sentence in sentences:
            if self.include_sentence_delimiters:
                sentence.insert(0,('xxx','<S>'))
                sentence.append(('xxx','</S>'))
        
            for idx in range(0,len(sentence)):
                for ngramlen in range(self.min_ngram_len,self.max_ngram_len+1):
                    file_desc = self.get_file_desc_for_ngram(ngramlen)
                    start = idx
                    end = start + ngramlen
                    if end <= len(sentence):
                        this_ngram = '\t'.join(value for wid, value in sentence[start:end])
                        this_ngram_pos = '\t'.join(pos_for_wid.get(wid,'X') for wid, value in sentence[start:end])
                        file_desc.write(this_ngram.encode('utf-8')+'\t'+DELIMITER+'\t'+this_ngram_pos+'\n')
Example no. 5
def test_create_terms():
    """
    Can we create_terms via the create_{term,token} functions?
    """

    naf = KafNafParser(type="NAF")
    sent = 1
    offset = 0
    input = [(u'dit', u'dit', u'O', u'VNW'), (u'is', u'zijn', u'V', u'WW'),
             (u'een', u'een', u'D', u'LID'), (u'test', u'test', u'N', u'N')]

    offset = 0
    for (word, lemma, pos, morph) in input:
        token = naf.create_wf(word, 1, offset)
        offset += len(word)
        term = naf.create_term(lemma, pos, morph, [token])

    tokens = {t.get_id(): t for t in naf.get_tokens()}
    assert_equal(len(tokens), 4)

    result = {}
    for term in naf.get_terms():
        for token_id in term.get_span().get_span_ids():
            token = tokens[token_id]
            result[term.get_id()] = (token.get_text(), term.get_lemma(),
                                     term.get_pos(), term.get_morphofeat())
    result = [result[tid] for tid in sorted(result.keys())]
    assert_equal(input, result)
Example no. 6
File: nlpipe.py Project: BBie/amcat
    def from_naf(self, article, naf):
        naf = KafNafParser(BytesIO(naf.encode("utf-8")))
        tokendict = {token.get_id(): token for token in naf.get_tokens()}

        for term in naf.get_terms():
            tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
            for token in tokens:
                yield {"aid": article.pk,
                       "token_id": token.get_id(),
                       "offset": token.get_offset(),
                       "sentence": token.get_sent(),
                       "para": token.get_para(),
                       "word": token.get_text(),
                       "term_id": term.get_id(),
                       "lemma": term.get_lemma(),
                       "pos": term.get_pos()}
Example no. 7
def read_training_data(file_name):
    """
    read kaf/naf and matches the aspects with the words
    """
    parser = KafNafParser(PATH_ANNOTATED_DATA + file_name)
    terms = list(parser.get_terms())
    # create token dictionary containing NAF info
    tokens_container = dict()
    for token_el in parser.get_tokens():
        token_node = token_el.node
        token_id = token_node.get('wid').replace('w', 't')
        token_info = token_node.attrib
        tokens_container[token_id] = token_info
    properties = list(parser.get_properties())
    handled_properties, term_dict = handle_properties(properties, terms,
                                                      tokens_container)
    return terms, properties, handled_properties, term_dict, tokens_container
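The loop above keys tokens_container on term-style ids by rewriting the 'w' prefix of each wid to 't'. A tiny, self-contained illustration of that rewrite on made-up word ids:

fake_wids = ['w1', 'w2', 'w3']
tokens_container = {wid.replace('w', 't'): {'wid': wid} for wid in fake_wids}
assert sorted(tokens_container) == ['t1', 't2', 't3']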
Example no. 8
    def convert(self, id, result, format):
        assert format == "csv"
        naf = KafNafParser(BytesIO(result.encode("utf-8")))
        memo = self._csv_memo(naf)
        tokendict = {token.get_id(): token for token in naf.get_tokens()}
        s = StringIO()
        w = csv.writer(s)
        w.writerow(self._csv_header())
        for term in naf.get_terms():
            tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
            for token in tokens:
                tid = term.get_id()
                pos = term.get_pos()
                pos1 = POSMAP[pos]
                row = [id] + list(self._csv_row(memo, term, token))
                w.writerow(row)
        return s.getvalue()
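The _csv_header, _csv_memo and _csv_row helpers are not shown in this listing. Judging from Example no. 1, which writes the expanded form of the same rows, a sketch consistent with that example (an assumption, not the project's actual code) could look like:

    def _csv_header(self):
        return ["id", "token_id", "offset", "sentence", "para", "word", "term_id",
                "lemma", "pos", "pos1", "parent", "relation"]

    def _csv_memo(self, naf):
        # term_id -> (dependency relation, parent term_id), as built in Example no. 1
        return {dep.get_to(): (dep.get_function(), dep.get_from())
                for dep in naf.get_dependencies()}

    def _csv_row(self, memo, term, token):
        tid = term.get_id()
        rel, parent = memo.get(tid, (None, None))
        return (token.get_id(), token.get_offset(), token.get_sent(), token.get_para(),
                token.get_text(), tid, term.get_lemma(), term.get_pos(),
                POSMAP[term.get_pos()], parent, rel and rel.split("/")[-1])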
Example no. 9
def process_file(this_file, token_freq):
    xml_obj = KafNafParser(this_file)
    print >> sys.stderr, 'Processing file', this_file
    token_for_wid = {}
    order_for_wid = {}
    opinion_expressions = []
    opinion_targets = []
    whole_text = ' '
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        token_for_wid[token.get_id()] = text
        order_for_wid[token.get_id()] = n
        whole_text += text + ' '
    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        tid = term.get_id()
        wids = term.get_span().get_span_ids()
        wids_for_tid[tid] = wids
        for wid in wids:
            lemma_for_wid[wid] = term.get_lemma()
            pos_for_wid[wid] = term.get_pos()
        
    
    ##Properties!
    aspects = [] ## [(label,term_span)...]
    
    for property in xml_obj.get_properties():
        for refs in property.get_references():
            for span in refs:
                aspects.append((property.get_type(),span.get_span_ids()))
       
    
    
    already_counted = {EXP:set(), TAR:set()}
    
    for opinion in xml_obj.get_opinions():   
        for this_type, opinion_obj in [(EXP,opinion.get_expression()),(TAR,opinion.get_target())]:
            if this_type is EXP and opinion_obj.get_polarity()=='NON-OPINIONATED':
                continue
            if opinion_obj is not None:
                span = opinion_obj.get_span()
                if span is not None:
                    list_wids = []
                    for tid in span.get_span_ids():
                        list_wids.extend(wids_for_tid.get(tid,[]))
                    list_wids.sort(key=lambda wid: order_for_wid[wid])  ## Sorted according to the order of the tokens
                    
                    string_wids = '#'.join(list_wids)
                    opinion_tokens = ' '.join( token_for_wid[wid] for wid in list_wids)
                    opinion_lemmas = ' '.join( lemma_for_wid[wid] for wid in list_wids)
                    opinion_pos    = ' '.join( pos_for_wid[wid]   for wid in list_wids)
                    
                   
                    if string_wids not in already_counted[this_type]:
                        if this_type == EXP:
                            polarity = (opinion_obj.get_polarity()).lower()
                            opinion_expressions.append((opinion_tokens,polarity,opinion_lemmas,opinion_pos))
                        else:
                            ##Calculate the aspect type
                            possible_aspects = []
                            target_ids = span.get_span_ids()
                            for aspect_label, aspect_span in aspects:
                                num_in_common = len(set(target_ids) & set(aspect_span))
                                if num_in_common != 0:
                                    possible_aspects.append((aspect_label,num_in_common,len(aspect_span)))
                            aspect_for_target = 'unknown'

                            if len(possible_aspects) != 0:
                                ##Sorting by the number in common first, and by the length of the aspect second
                                aspect_for_target = sorted(possible_aspects,key=lambda t: (t[1],t[2]), reverse=True)[0][0]
                            opinion_targets.append((opinion_tokens,aspect_for_target, opinion_lemmas,opinion_pos))
                        already_counted[this_type].add(string_wids)    
      
    del xml_obj
    print >> sys.stderr, '\tNumber of opinion expressions:', len(opinion_expressions)
    print >> sys.stderr, '\tNumber of opinion targets:', len(opinion_targets)
    print >> sys.stderr, '\tNumber of characters of the text:', len(whole_text)
    return opinion_expressions, opinion_targets, whole_text
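A hedged sketch of how process_file might be driven. Using collections.Counter for token_freq is an assumption (any mapping that supports '+= 1' on unseen keys, such as defaultdict(int), would also work), and the file name is a placeholder:

from collections import Counter

token_freq = Counter()
expressions, targets, text = process_file('review.kaf', token_freq)  # 'review.kaf' is a placeholder path
most_common_tokens = token_freq.most_common(10)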
Example no. 10
def create_training_sentences(folder_tag_in, folder_kaf_in, opinion_layers,
                              non_opinion, folder_out):
    #Remove the outputfolder if exists and create it again
    if os.path.exists(folder_out):
        shutil.rmtree(folder_out)
    os.mkdir(folder_out)
    total_sents_opi = total_sents_no_opi = 0

    for tag_file in glob.glob(os.path.join(folder_tag_in, '*.tag')):
        basename = os.path.basename(tag_file).replace('.tag', '')
        kaf_file = os.path.join(folder_kaf_in, basename + '.kaf')
        if os.path.exists(kaf_file):
            ##From the tag file we extract the token ids for opinions and for non opinionated
            opinion_wids = set()  #token ids annotated as opinions
            no_opinion_wids = set()  #token ids annotated as no opinions

            fd = open(tag_file, 'rb')
            for line in fd:
                fields = line.strip().split('\t')
                wid = fields[0]
                for opinion_idx in opinion_layers:
                    if fields[opinion_idx] == 'Opinion':
                        opinion_wids.add(wid)

                    if non_opinion is not None and fields[
                            non_opinion] == 'NON-OPINIONATED':
                        no_opinion_wids.add(wid)

            fd.close()
            #########

            ###
            # Obtain the sentences that are opinionated (positive) and not (negative)
            # The negatives are:
            # If there are non-opinionated:  just the non opinionated
            # If not --> all the rest that are not positive
            #####
            sentences = {}
            all_sent_ids = set()
            sent_for_token_id = {}
            kaf_obj = KafNafParser(kaf_file)
            for token in kaf_obj.get_tokens():
                token_id = token.get_id()
                sent_id = token.get_sent()
                token_value = token.get_text()

                if sent_id not in sentences:
                    sentences[sent_id] = []
                sentences[sent_id].append(token_value)

                all_sent_ids.add(sent_id)

                sent_for_token_id[token_id] = sent_id
            ###

            positive_sents = set()
            negative_sents = set()

            ##Positive sents are the sentences for the opinion_ids
            for token_id in opinion_wids:
                positive_sents.add(sent_for_token_id[token_id])
            ####

            #Negative sents
            if non_opinion is not None:
                #In this case the negative are just the sentence of the no_opinion_wids
                for token_id in no_opinion_wids:
                    negative_sents.add(sent_for_token_id[token_id])
            else:
                #In this case the negative are all the sentences but the positive ones
                negative_sents = all_sent_ids - positive_sents

            #Free some memory
            del opinion_wids
            del no_opinion_wids
            del kaf_obj

            ##Store the results in the file
            output_file = os.path.join(folder_out, basename + '.sents')
            fd_out = open(output_file, 'w')
            fd_out.write('#' + tag_file + '\n')
            for sent_id in sorted(list(positive_sents)):
                text = ' '.join(sentences[sent_id])
                fd_out.write('+ ' + text.encode('utf-8') + '\n')

            for sent_id in sorted(list(negative_sents)):
                text = ' '.join(sentences[sent_id])
                fd_out.write('- ' + text.encode('utf-8') + '\n')
            fd_out.close()

            #print 'Processed ',basename
            #print '   Subjective sents:',len(positive_sents)
            #print '   Non subje. sents:',len(negative_sents)
            total_sents_opi += len(positive_sents)
            total_sents_no_opi += len(negative_sents)
        else:
            print 'KAF FILE NOT FOUND', kaf_file
    return total_sents_opi, total_sents_no_opi
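A hedged invocation sketch; the folder names and the tag-file column indices below are placeholders, not values taken from the project:

totals = create_training_sentences(folder_tag_in='tagged/',       # placeholder folder of .tag files
                                   folder_kaf_in='kaf/',           # placeholder folder of .kaf files
                                   opinion_layers=[1, 2],          # assumed columns carrying 'Opinion' labels
                                   non_opinion=None,               # None: negatives are all non-positive sentences
                                   folder_out='training_sents/')
total_sents_opi, total_sents_no_opi = totals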
Example no. 11
		treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-spanish'
		mapping_file = this_folder +'/mappings/spanish.map.treetagger.kaf.csv'
		model = 'Spanish models'
	else: ## Default is dutch
		print>>sys.stderr,'Language',my_lang,'not supported by this wrapper'
		sys.exit(0)

	map_tt_to_kaf = loadMapping(mapping_file)


	## Create the input text for
	reference_tokens = []
	sentences = []
	prev_sent='-200'
	aux = []
	for token in input_obj.get_tokens():
		sent_id = token.get_sent()
		word = token.get_text()
		w_id = token.get_id()
		if sent_id != prev_sent:
			if len(aux) != 0:
				sentences.append(aux)
				aux = []
		aux.append((word,w_id))

		prev_sent = sent_id
	if len(aux)!=0:
		sentences.append(aux)


	num_term = 0 
Example no. 12
def get_terms_in_sentence(naf: KafNafParser, sent: int) -> Iterable[Cterm]:
    tokens = sort_tokens(t for t in naf.get_tokens() if t.get_sent() == sent)
    tokenids = [t.get_id() for t in tokens]
    return sort_terms(
        naf, [naf.get_term(tid) for tid in naf.map_tokens_to_terms(tokenids)])
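A usage sketch for get_terms_in_sentence; the file name is a placeholder, and sorting the sentence ids with key=int assumes they are numeric strings:

naf = KafNafParser('document.naf')  # placeholder file name
sentence_ids = sorted({t.get_sent() for t in naf.get_tokens()}, key=int)
for sent in sentence_ids:
    lemmas = [term.get_lemma() for term in get_terms_in_sentence(naf, sent)]
    print(sent, ' '.join(lemmas))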
Example no. 13
def process_file(this_file, token_freq):
    xml_obj = KafNafParser(this_file)
    print >> sys.stderr, 'Processing file', this_file
    token_for_wid = {}
    order_for_wid = {}
    opinion_expressions = []
    opinion_targets = []
    whole_text = ' '
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        token_for_wid[token.get_id()] = text
        order_for_wid[token.get_id()] = n
        whole_text += text + ' '
    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        tid = term.get_id()
        wids = term.get_span().get_span_ids()
        wids_for_tid[tid] = wids
        for wid in wids:
            lemma_for_wid[wid] = term.get_lemma()
            pos_for_wid[wid] = term.get_pos()

    ##Properties!
    aspects = []  ## [(label,term_span)...]

    for property in xml_obj.get_properties():
        for refs in property.get_references():
            for span in refs:
                aspects.append((property.get_type(), span.get_span_ids()))

    already_counted = {EXP: set(), TAR: set()}

    for opinion in xml_obj.get_opinions():
        for this_type, opinion_obj in [(EXP, opinion.get_expression()),
                                       (TAR, opinion.get_target())]:
            if this_type is EXP and opinion_obj.get_polarity(
            ) == 'NON-OPINIONATED':
                continue
            if opinion_obj is not None:
                span = opinion_obj.get_span()
                if span is not None:
                    list_wids = []
                    for tid in span.get_span_ids():
                        list_wids.extend(wids_for_tid.get(tid, []))
                    list_wids.sort(key=lambda wid: order_for_wid[
                        wid])  ## Sorted according to the order of the tokens

                    string_wids = '#'.join(list_wids)
                    opinion_tokens = ' '.join(token_for_wid[wid]
                                              for wid in list_wids)
                    opinion_lemmas = ' '.join(lemma_for_wid[wid]
                                              for wid in list_wids)
                    opinion_pos = ' '.join(pos_for_wid[wid]
                                           for wid in list_wids)

                    if string_wids not in already_counted[this_type]:
                        if this_type == EXP:
                            polarity = (opinion_obj.get_polarity()).lower()
                            opinion_expressions.append(
                                (opinion_tokens, polarity, opinion_lemmas,
                                 opinion_pos))
                        else:
                            ##Calculate the aspect type
                            possible_aspects = []
                            target_ids = span.get_span_ids()
                            for aspect_label, aspect_span in aspects:
                                num_in_common = len(
                                    set(target_ids) & set(aspect_span))
                                if num_in_common != 0:
                                    possible_aspects.append(
                                        (aspect_label, num_in_common,
                                         len(aspect_span)))
                            aspect_for_target = 'unknown'

                            if len(possible_aspects) != 0:
                                ##Sorting by the number in common first, and by the length of the aspect second
                                aspect_for_target = sorted(possible_aspects,
                                                           key=lambda t:
                                                           (t[1], t[2]),
                                                           reverse=True)[0][0]
                            opinion_targets.append(
                                (opinion_tokens, aspect_for_target,
                                 opinion_lemmas, opinion_pos))
                        already_counted[this_type].add(string_wids)

    del xml_obj
    print >> sys.stderr, '\tNumber of opinion expressions:', len(
        opinion_expressions)
    print >> sys.stderr, '\tNumber of opinion targets:', len(opinion_targets)
    print >> sys.stderr, '\tNumber of characters of the text:', len(whole_text)
    return opinion_expressions, opinion_targets, whole_text
Example no. 14
def load_naf_stdin():
    """Load a dataset in NAF format.

    Use this function to create a new ConlluDataset from a NAF file,
    read from stdin.

    NOTE: you can only add to NAF files, not create one from scratch.
    """
    my_parser = KafNafParser(sys.stdin)

    my_dataset = ConlluDataset()

    # a big look-up table: for any NAF id, return a hash with
    # {sent_id, token_id} in the ConlluDataset
    naf2conll_id = {}

    # collect the sentences in a hash, indexed by token_obj.get_sent()
    sentences = {}

    # iterate over the tokens to get: ID, FORM
    for token_obj in my_parser.get_tokens():
        # (string) identifier of the sentence
        sent_id = token_obj.get_sent()
        if sent_id in sentences:
            sentence = sentences[sent_id]
        else:
            sentence = Sentence(sent_id=sent_id)
            sentences[sent_id] = sentence

        # (string) number of the token in the sentence, starting at '1'
        token_id = '{}'.format(len(sentence) + 1)  # ID

        new_token = Token([
            token_id,  # ID
            token_obj.get_text(),  # FORM
            '_',  # LEMMA
            '_',  # UPOS
            '_',  # XPOS
            '_',  # FEATS
            '0',  # HEAD -> to be overwritten later
            'root',  # DEPREL -> to be overwritten later
            '_',  # DEPS
            '_'  # MISC
        ])

        sentence.add(new_token)

        # to match a NAF span to conll tokens, we need sent_id and token_id
        naf2conll_id[token_obj.get_id()] = {
            'sent_id': sent_id,
            'token_id': token_id
        }

    # iterate over the terms to get: LEMMA, XPOS, UPOS, FEATS, sent_id, nafid
    for term_obj in my_parser.get_terms():
        # span
        # TODO: for now, assume terms map one-on-one on tokens
        nafid = term_obj.get_span().get_span_ids()
        if len(nafid) > 1:
            logging.error('Multi-word tokens not implemented yet.')
            return
        nafid = nafid[0]

        conllid = naf2conll_id[nafid]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]

        token_id = conllid['token_id']
        token = sentence[token_id]

        # store the identifier of the NAF term on the token, so we can add
        # information to the NAF later.
        token.nafid = term_obj.get_id()

        token.LEMMA = term_obj.get_lemma()

        # NAF pos='' is in lower case, UD UPOS is upper case
        token.UPOS = term_obj.get_pos().upper()

        # naf: A(B,C) -> ud: A|B|C
        xpos = term_obj.get_morphofeat()
        if xpos:
            token.XPOS = xpos.replace('(', '|').replace(')',
                                                        '').replace(',', '|')
            if token.XPOS[-1] == '|':
                token.XPOS = token.XPOS[:-1]

        # look for an external reference containing FEATS
        for ext_ref in term_obj.get_external_references():
            if ext_ref.get_reftype() == 'FEATS':
                token.FEATS = ext_ref.get_reference()

        # to match NAF dependencies to conll tokens, we need sent_id and token_id
        naf2conll_id[term_obj.get_id()] = {
            'sent_id': sent_id,
            'token_id': token_id
        }

    # iterate over the dependencies to get: HEAD, DEPREL
    for dep_obj in my_parser.get_dependencies():
        # from
        conllid = naf2conll_id[dep_obj.get_from()]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]

        token_id = conllid['token_id']
        token_from = sentence[token_id]

        # to
        conllid = naf2conll_id[dep_obj.get_to()]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]

        token_id = conllid['token_id']
        token_to = sentence[token_id]

        # function
        depfunc = dep_obj.get_function()

        token_to.HEAD = token_from.ID
        token_to.DEPREL = depfunc

    # A final conversion of our list of sentences to a ConlluDataset
    for sent_id in sentences:
        sentence = sentences[sent_id]

        # construct the sentence.full_text
        raw_tokens = []
        for token in sentence:
            raw_tokens.append(token.FORM)
        sentence.full_text = ' '.join(raw_tokens)

        # add to the dataset
        my_dataset.add(sentence)

    my_dataset.naf2conll_id = naf2conll_id

    return my_dataset, my_parser
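A hedged usage sketch that only touches what load_naf_stdin itself sets up (the naf2conll_id mapping stored on the dataset); because the function reads from stdin, it is meant to be run as a script with a NAF document piped in:

if __name__ == '__main__':
    dataset, naf_parser = load_naf_stdin()
    # map each NAF id back to its CoNLL-U (sent_id, token_id) pair
    for naf_id, conll_id in sorted(dataset.naf2conll_id.items()):
        print(naf_id, '->', conll_id['sent_id'], conll_id['token_id'])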
Example no. 15
def create_training_sentences(folder_tag_in, folder_kaf_in, opinion_layers, non_opinion, folder_out):
    #Remove the outputfolder if exists and create it again
    if os.path.exists(folder_out):
        shutil.rmtree(folder_out)
    os.mkdir(folder_out)
    total_sents_opi = total_sents_no_opi = 0
        
    for tag_file in glob.glob(os.path.join(folder_tag_in,'*.tag')):
        basename = os.path.basename(tag_file).replace('.tag','')
        kaf_file = os.path.join(folder_kaf_in,basename+'.kaf')
        if os.path.exists(kaf_file):
            ##From the tag file we extract the token ids for opinions and for non opinionated
            opinion_wids = set()        #token ids annotated as opinions
            no_opinion_wids = set()     #token ids annotated as no opinions
            
            fd = open(tag_file,'rb')
            for line in fd:
                fields = line.strip().split('\t')
                wid = fields[0]
                for opinion_idx in opinion_layers:
                    if fields[opinion_idx] == 'Opinion':
                        opinion_wids.add(wid)
                    
                    if non_opinion is not None and fields[non_opinion] == 'NON-OPINIONATED':
                        no_opinion_wids.add(wid)
                        
            fd.close()
            #########
            
            ###
            # Obtain the sentences that are opinionated (positive) and not (negative)
            # The negatives are:
            # If there are non-opinionated:  just the non opinionated
            # If not --> all the rest that are not positive
            #####
            sentences = {}
            all_sent_ids = set()
            sent_for_token_id = {}
            kaf_obj = KafNafParser(kaf_file)
            for token in kaf_obj.get_tokens():
                token_id = token.get_id()
                sent_id = token.get_sent()
                token_value = token.get_text()
                
                if sent_id not in sentences:
                    sentences[sent_id] = []
                sentences[sent_id].append(token_value)
                
                all_sent_ids.add(sent_id)
                
                sent_for_token_id[token_id] = sent_id
            ###
            
            positive_sents = set()
            negative_sents = set()
            
            ##Positive sents are the sentences for the opinion_ids
            for token_id in opinion_wids:
                positive_sents.add(sent_for_token_id[token_id])
            ####
            
            #Negative sents
            if non_opinion is not None:
                #In this case the negative are just the sentence of the no_opinion_wids
                for token_id in no_opinion_wids:
                    negative_sents.add(sent_for_token_id[token_id])
            else:
                #In this case the negative are all the sentences but the positive ones
                negative_sents = all_sent_ids - positive_sents
                
            #Free some memory    
            del opinion_wids
            del no_opinion_wids
            del kaf_obj
            
            ##Store the results in the file
            output_file = os.path.join(folder_out,basename+'.sents')
            fd_out = open(output_file,'w')
            fd_out.write('#'+tag_file+'\n')
            for sent_id in sorted(list(positive_sents)):
                text = ' '.join(sentences[sent_id])
                fd_out.write('+ '+text.encode('utf-8')+'\n')
                
            for sent_id in sorted(list(negative_sents)):
                text = ' '.join(sentences[sent_id])
                fd_out.write('- '+text.encode('utf-8')+'\n')
            fd_out.close()
            
            #print 'Processed ',basename
            #print '   Subjective sents:',len(positive_sents)
            #print '   Non subje. sents:',len(negative_sents)
            total_sents_opi += len(positive_sents)
            total_sents_no_opi += len(negative_sents)
        else:
            print 'KAF FILE NOT FOUND',kaf_file
    return total_sents_opi, total_sents_no_opi
Example no. 16
def add_file(filename, data_lexelt, reftype='lexical_key'):
    obj = KafNafParser(filename)
    tokens_per_sent = {}
    sent_for_token = {}
    sents_in_order = []
    for token in obj.get_tokens():
        sentid = token.get_sent()
        if sentid not in sents_in_order:
            sents_in_order.append(sentid)
        sent_for_token[token.get_id()] = sentid
        if sentid not in tokens_per_sent: tokens_per_sent[sentid] = []
        tokens_per_sent[sentid].append((token.get_id(), token.get_text()))

    annotated_lemmas = []  # LIST of (full_id, token ids, lemma,pos,synset)
    for term in obj.get_terms():
        synset_label = None
        for ext_ref in term.get_external_references():
            if ext_ref.get_reftype() == 'lexical_key':
                synset_label = term.get_lemma() + '%' + ext_ref.get_reference()
            elif ext_ref.get_reftype() == 'sense' and ext_ref.get_resource(
            ) == 'WordNet-3.0':
                synset_label = ext_ref.get_reference()
            if synset_label is not None:
                break

        if synset_label is not None:
            annotated_lemmas.append(
                (filename + '#' + term.get_id(),
                 term.get_span().get_span_ids(), term.get_lemma(),
                 term.get_pos(), synset_label))

    for full_id, token_ids, lemma, pos, synset_label in annotated_lemmas:
        #CREATE NEW INSTANCE

        this_key = lemma + '.' + pos.lower()[0]
        if this_key not in data_lexelt:
            data_lexelt[this_key] = Clexelt(this_key, pos)

        if not data_lexelt[this_key].exists(full_id):
            #Create the new instance
            new_instance = Cinstance()
            new_instance.id = full_id
            new_instance.docsrc = filename
            new_instance.key = synset_label

            tokens = []
            target_indexes = []
            this_sent = sent_for_token[token_ids[0]]
            index = sents_in_order.index(this_sent)
            start_idx = max(index - 2, 0)
            end_idx = min(index + 2, len(sents_in_order) - 1)
            selected_sents = sents_in_order[start_idx:end_idx + 1]
            num_token = 0
            for current_sent in selected_sents:
                for token_id, token_text in tokens_per_sent[str(current_sent)]:
                    tokens.append(token_text)
                    if token_id in token_ids:
                        target_indexes.append(num_token)
                    num_token += 1

            new_instance.tokens = tokens[:]
            new_instance.index_head = target_indexes[:]
            data_lexelt[this_key].add_instance(new_instance)
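A hypothetical driver for add_file; the folder name is a placeholder, and it assumes the project's Clexelt and Cinstance classes used above are importable in the same module:

import glob

data_lexelt = {}  # lexelt key such as 'house.n' -> Clexelt, filled in by add_file
for filename in sorted(glob.glob('annotated_naf/*.naf')):  # placeholder folder
    add_file(filename, data_lexelt, reftype='sense')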
Example no. 17
    def process_single_file(self, file):
        try:
            xml_obj = KafNafParser(file)
        except:
            print >> sys.stderr, 'Error parsing', file, ': skipped'
            return

        print >> sys.stderr, 'Processing file', os.path.basename(
            file), 'Type:', xml_obj.get_type()
        self.langs[xml_obj.get_language()] += 1
        sentences = []
        current_sent = []
        this_sent = None

        pos_for_wid = {}  ## For each token id (wid) the pos of it
        for term in xml_obj.get_terms():
            w_ids = term.get_span().get_span_ids()
            pos = term.get_pos()
            for wid in term.get_span().get_span_ids():
                pos_for_wid[wid] = pos

        for token in xml_obj.get_tokens():
            wid = token.get_id()
            value = token.get_text()
            if self.convert_to_lowercase:
                value = value.lower()

            if value in self.punctuation:
                value = 'PUN'

            if value == '*':
                value = 'STAR'

            sentence = token.get_sent()
            if this_sent is not None and sentence != this_sent:  ## There is a new sent
                sentences.append(current_sent)
                current_sent = []
            current_sent.append((wid, value))
            this_sent = sentence
        ## Add the last sentence as well
        sentences.append(current_sent)

        for sentence in sentences:
            if self.include_sentence_delimiters:
                sentence.insert(0, ('xxx', '<S>'))
                sentence.append(('xxx', '</S>'))

            for idx in range(0, len(sentence)):
                for ngramlen in range(self.min_ngram_len,
                                      self.max_ngram_len + 1):
                    file_desc = self.get_file_desc_for_ngram(ngramlen)
                    start = idx
                    end = start + ngramlen
                    if end <= len(sentence):
                        this_ngram = '\t'.join(
                            value for wid, value in sentence[start:end])
                        this_ngram_pos = '\t'.join(
                            pos_for_wid.get(wid, 'X')
                            for wid, value in sentence[start:end])
                        file_desc.write(
                            this_ngram.encode('utf-8') + '\t' + DELIMITER +
                            '\t' + this_ngram_pos + '\n')