Ejemplo n.º 1
0
def _make_argument_phrase(tokens, entitylabel, lastoffset, firstoffset):
    """Build the record describing one collected argument span."""
    return {'surface': " ".join(tokens), 'entitylabel': entitylabel,
            'headentity': tokens[-1], 'offset': lastoffset,
            'firstoffset': firstoffset}


def process_input_phrase(fname):
    """Group tagged tokens of a label file into argument phrases per sentence.

    Each line longer than three characters is split on tabs; column 0 is the
    token text, column 1 an integer offset, column 2 a ``B-``/``I-`` prefixed
    tag and column 3 a prefixed role.  Only tags whose suffix (after the
    two-character prefix) is in ``ArgumentList3`` contribute to phrases.
    Shorter lines act as sentence separators.

    Returns:
        (sentences, labels): ``sentences`` is a list with, per sentence, a
        list of phrase dicts (keys ``surface``, ``entitylabel``,
        ``headentity``, ``offset``, ``firstoffset``); ``labels`` holds the
        role of each phrase in the same layout.

    Note:
        A sentence is only emitted when a separator line is reached, so
        tokens after the last separator line are not returned.
    """
    content = utils.readFileEncode(fname, 'utf8')
    lines = content.split('\n')[:-1]
    last_index = len(lines) - 1

    sentences, labels = [], []
    phrases, label, text = [], [], []
    oldtype, oldoffset, firstoffset, thislabel, thisrole = '', 0, 0, 0, ''

    for i, line in enumerate(lines):
        if len(line) <= 3:
            # Separator line: close the pending phrase, then the sentence.
            if text:
                phrases.append(
                    _make_argument_phrase(text, thislabel, oldoffset,
                                          firstoffset))
                label.append(thisrole)
                text = []

            if phrases and label:
                sentences.append(phrases)
                labels.append(label)
                phrases, label = [], []
            elif not phrases and i < last_index:
                # Keep a placeholder for sentences without any phrase,
                # except on the very last line of the file.
                sentences.append([])
                labels.append([])
            continue

        words = line.split('\t')
        tag = words[2]
        if tag[2:] in ArgumentList3:
            begins = tag.startswith('B-')
            inside = tag.startswith('I-')
            # An I- tag whose type differs from the previous line's tag
            # is treated like the start of a new phrase.
            if begins or (inside and tag[2:] != oldtype[2:]):
                if text:
                    phrases.append(
                        _make_argument_phrase(text, thislabel, oldoffset,
                                              firstoffset))
                    label.append(thisrole)
                text = [words[0]]
                firstoffset = int(words[1])
                thislabel = tag[2:]
                thisrole = words[3][2:]
            elif inside:
                text.append(words[0])
            # Updated after any flush so the flushed phrase keeps the offset
            # of the previous selected token.
            oldoffset = int(words[1])
        oldtype = tag

    return sentences, labels
Ejemplo n.º 2
0
def main():
    """Run the evaluation selected by ``args.metric`` on the predicted file.

    The predicted file is read as UTF-8 and split into lines (the element
    after the final newline is dropped).  Supported metrics are ``f1``,
    ``confusion_role``, ``confusion_token`` and ``confusion_label``; any
    other value only prints the closing separator line.
    """
    args = parser.parse_args()

    content = utils.readFileEncode(args.predictedfile, 'utf8')
    lines = content.split('\n')[:-1]

    if args.metric == 'f1':
        gold, predicted, selectedlist, raw = collect(lines, args.options)
        mention(gold, predicted, selectedlist, args.O)
    elif args.metric == 'confusion_role':
        confusion_role(lines)
    elif args.metric == 'confusion_token':
        gold, predicted, selectedlist, raw = collectlabels(lines, args.options)
        confusion_token(gold, predicted, selectedlist, raw)
    elif args.metric == 'confusion_label':
        # Previously this branch used gold/predicted/selectedlist/raw without
        # ever assigning them, which raised UnboundLocalError.
        # NOTE(review): collectlabels is chosen by analogy with the
        # confusion_token branch - confirm it is the intended collector.
        gold, predicted, selectedlist, raw = collectlabels(lines, args.options)
        confusion_label(gold, predicted, selectedlist, raw)
    print('=========================================')
Ejemplo n.º 3
0
def process_input(fname, onlynugget, onlyarg):
    """Split a tab-separated label file into per-sentence tokens and labels.

    Lines longer than three characters are token lines: column 0 is the
    surface text, column 1 an integer offset and column 2 the tag.  Any
    shorter line ends the current sentence.

    Args:
        fname: path of the file to read (UTF-8).
        onlynugget: when true, keep tags found in ``NuggetList10`` and map
            everything else to ``'O'``.
        onlyarg: consulted only when ``onlynugget`` is false; keep tags found
            in ``ArgumentList`` (rewriting tags containing ``'Software'`` to
            their two-character prefix plus ``'System'``) and map everything
            else to ``'O'``.

    Returns:
        (sentences, labels): ``sentences`` is a list of sentences, each a
        list of ``{'originalText': ..., 'offset': ...}`` dicts; ``labels`` is
        the matching list of label lists.

    Note:
        If both flags are false no label is recorded, so no sentence is ever
        closed and the result is empty.
    """
    raw = utils.readFileEncode(fname, 'utf8')
    rows = raw.split('\n')[:-1]
    last_index = len(rows) - 1

    sentences, labels = [], []
    sent, label = [], []

    for i, row in enumerate(rows):
        if len(row) <= 3:
            # Sentence boundary.
            if sent and label:
                sentences.append(sent)
                labels.append(label)
                sent, label = [], []
            elif not sent and i < last_index:
                # Placeholder for an empty sentence (not on the last line).
                sentences.append([])
                labels.append([])
            continue

        words = row.split('\t')
        sent.append({'originalText': words[0], 'offset': int(words[1])})

        if onlynugget:
            label.append(words[2] if words[2] in NuggetList10 else 'O')
        elif onlyarg:
            if words[2] not in ArgumentList:
                label.append('O')
            elif 'Software' in words[2]:
                # Keep the two-character prefix, rename the type to System.
                label.append(words[2][0:2] + 'System')
            else:
                label.append(words[2])

    return sentences, labels
Ejemplo n.º 4
0
def _select_label(tag, onlytrigger, onlyarg):
    """Return the label to record for ``tag``, or None when no mode is set."""
    if onlytrigger:
        if tag in NuggetList10:
            return tag
        return 'O'
    if onlyarg:
        if tag in ArgumentList:
            if 'Software' in tag:
                # Keep the two-character prefix, rename the type to System.
                return tag[0:2] + 'System'
            return tag
        return 'O'
    return None


def process_input(fname, onlytrigger, onlyarg):
    """Split a tab-separated label file into per-sentence tokens and labels.

    Token lines (more than three characters) carry the surface text in
    column 0 and the tag in column 2; any shorter line closes the current
    sentence.  With ``onlytrigger`` true, tags are filtered against
    ``NuggetList10``; otherwise, with ``onlyarg`` true, against
    ``ArgumentList``.  Tags outside the list become ``'O'``.

    Returns:
        (sentences, labels): per sentence, a list of
        ``{'originalText': ...}`` dicts and the parallel list of labels.
    """
    file_text = utils.readFileEncode(fname, 'utf8')
    lines = file_text.split('\n')[:-1]
    final_index = len(lines) - 1

    sentences = []
    labels = []
    sent = []
    label = []

    for i, line in enumerate(lines):
        if len(line) > 3:
            words = line.split('\t')
            sent.append({'originalText': words[0]})
            # The tag column is only read when one of the modes is active.
            if onlytrigger or onlyarg:
                label.append(_select_label(words[2], onlytrigger, onlyarg))
        elif sent and label:
            sentences.append(sent)
            labels.append(label)
            sent = []
            label = []
        elif not sent and i < final_index:
            # Placeholder for an empty sentence (not on the last line).
            sentences.append([])
            labels.append([])

    return sentences, labels
Ejemplo n.º 5
0
def realis_to_ann(dir, result):
    """Append predicted Realis attributes to each ``<fileno>_pred.ann`` file.

    For every file number in ``result`` the existing annotation file is
    parsed, and for each event whose trigger offset (minus the header size
    reported by ``cuthead``) appears in a result entry's ``'offset'``
    collection, one line ``A<n>\\tRealis <eventid> <pred>`` is appended.

    Args:
        dir: path prefix that is concatenated directly with the file number
            (so it is expected to end with a path separator).
        result: mapping from file number to an indexable collection of
            entries with ``'offset'`` and ``'pred'`` keys.
    """
    for fileno in result.keys():
        aid = 1
        annfile = dir + fileno + '_pred.ann'
        content = utils.readFileEncode(annfile, 'utf8')
        token, event, relationlist, attrlist = ann2xml.readAnn(content)
        txtfile = dir + fileno + '.txt'
        head = cuthead(txtfile)
        entries = result[fileno]
        # The context manager closes the handle even if a lookup below raises.
        with codecs.open(annfile, 'a', 'utf8') as f:
            for eventid in event.keys():
                triggerid = event[eventid]['triggertokenid']
                annoffset = int(token[triggerid]['startOffset']) - head
                for i in range(len(entries)):
                    entry = entries[i]
                    # At most one attribute line per (event, entry) pair.
                    if any(k == annoffset for k in entry['offset']):
                        f.write('A' + str(aid) + '\t' + 'Realis' + ' ' +
                                eventid + ' ' + entry['pred'] + '\n')
                        aid += 1
Ejemplo n.º 6
0
def _record_event_phrase(text, offsets, realis, eventtype, labeloption,
                         phrases, label):
    """Append one finished event phrase and its label, per ``labeloption``."""
    phrase = {
        'surface': " ".join(text),
        'realislabel': realis,
        'offset': offsets,
        'eventtype': eventtype
    }
    if labeloption == 1:  # generic vs specific
        if realis == 'Other' or realis == 'Actual':
            label.append("NotGeneric")
        else:
            label.append("Generic")
        phrases.append(phrase)
    elif labeloption == 2:  # not generic -> actual vs other
        if realis != 'Generic':
            label.append(realis)
            phrases.append(phrase)


def process_input_phrase(fname, labeloption):
    """Group event-trigger tokens of a label file into phrases per sentence.

    Each line longer than three characters is split on tabs: column 0 is the
    token text, column 1 an integer offset, column 2 a ``B-``/``I-`` event
    tag and column 4 the realis value.  Only tags present in ``EventList``
    are considered.  Shorter lines act as sentence separators.

    Args:
        fname: path of the file to read (UTF-8).
        labeloption: 1 labels every phrase ``"NotGeneric"`` (realis
            ``Other``/``Actual``) or ``"Generic"``; 2 drops phrases whose
            realis is ``Generic`` and labels the rest with their realis
            value.  Any other value records nothing.

    Returns:
        (sentences, labels): per sentence, a list of phrase dicts (keys
        ``surface``, ``realislabel``, ``offset``, ``eventtype``) and the
        parallel list of labels.
    """
    content = utils.readFileEncode(fname, 'utf8')
    lines = content.split('\n')[:-1]
    last_index = len(lines) - 1

    sentences, labels = [], []
    phrases, label, text = [], [], []
    oldtype, offsets, oldevent = '', [], ''

    for i, line in enumerate(lines):
        if len(line) > 3:
            words = line.split('\t')
            tag = words[2]
            # Select only tokens whose tag is listed in EventList.
            if tag in EventList:
                begins = tag.startswith('B-')
                inside = tag.startswith('I-')
                if begins or inside:
                    # A B- tag, or an I- tag of a different event type,
                    # closes the phrase collected so far.  The I- path used
                    # to test for 'General' instead of 'Generic', letting
                    # Generic phrases through when labeloption == 2.
                    if (begins or tag[2:] != oldevent) and text:
                        _record_event_phrase(text, offsets, oldtype, oldevent,
                                             labeloption, phrases, label)
                        # Rebind (not clear) so the stored phrase keeps its
                        # own offsets list.
                        text, offsets = [], []
                    text.append(words[0])
                    offsets.append(int(words[1]))

                oldtype = words[4]
                oldevent = tag[2:]
        else:
            if text:
                _record_event_phrase(text, offsets, oldtype, oldevent,
                                     labeloption, phrases, label)
                text, offsets = [], []

            if len(phrases) > 0 and len(label) > 0:
                sentences.append(phrases)
                labels.append(label)
                phrases = []
                label = []
            elif len(phrases) == 0 and i < last_index:
                # Placeholder for a sentence without phrases (not on the
                # last line of the file).
                sentences.append([])
                labels.append([])

    return sentences, labels
Ejemplo n.º 7
0
def _write_argument(f, token, event, fallback_tokenid, word, sample, offset,
                    head, label, tokid):
    """Write one argument span as a T line and attach it to one event.

    The span is attached to the first event (in ``event`` iteration order)
    that passes the position check selected by ``word['triggerposition']``
    and whose trigger label / text match ``word['nearevent']`` /
    ``word['neartrigger']``.
    """
    text = " ".join(sample)
    startoffset = offset[0] + head
    endoffset = offset[-1] + len(sample[-1]) + head
    f.write('T' + str(tokid) + '\t' + label + ' ' + str(startoffset) + ' ' +
            str(endoffset) + '\t' + text + '\n')
    for eventid in event.keys():
        trggrtokenid = event[eventid]['triggertokenid']
        trigger = token[trggrtokenid]
        position = word['triggerposition']
        if position == 'before':
            positioned = startoffset > int(trigger['endOffset'])
        elif position == 'after':
            positioned = endoffset > int(trigger['startOffset'])
        else:
            # NOTE(review): the original code compares against the last token
            # read from the .ann file rather than this event's trigger; that
            # looks like a stale loop variable but is preserved here - confirm.
            positioned = startoffset > int(
                token[fallback_tokenid]['endOffset'])
        if (positioned and word['nearevent'] == trigger['label']
                and word['neartrigger'] in trigger['text']):
            event[eventid]['arguments'].append({
                'argname': label,
                'value': tokid
            })
            break


def argument_to_ann(dir, result):
    """Append predicted argument spans and event lines to ``_pred.ann`` files.

    For every file number in ``result``:

    * every token already present in ``<fileno>_pred.ann`` is treated as an
      event trigger and numbered from 1;
    * consecutive ``B-``/``I-`` predictions are merged into spans, each
      written as a new ``T<n>`` line (numbering continues after the highest
      existing T id) with offsets shifted by the header size from
      ``cuthead``;
    * each span is attached to at most one event, and finally one ``E<n>``
      line per event is written listing its attached arguments.

    Args:
        dir: path prefix concatenated directly with the file number.
        result: mapping fileno -> mapping wordno -> dict with keys ``pred``,
            ``text``, ``offset``, ``triggerposition``, ``nearevent`` and
            ``neartrigger``.

    Raises:
        ValueError: if the annotation file contains no tokens (``max`` of an
            empty list).
    """
    for fileno in result.keys():
        eventid = 1
        jfile = dir + fileno + '.content.json'
        content = utils.loadJsontoDict(jfile)
        # Not used below; kept so a missing file or key still raises as before.
        sentences = content['sentences']

        txtfile = dir + fileno + '.txt'
        head = cuthead(txtfile)

        annfile = dir + fileno + '_pred.ann'
        content = utils.readFileEncode(annfile, 'utf8')
        token, _event, _relationlist, _attrlist = ann2xml.readAnn(content)

        # Rebuild the event table: one event per token found in the .ann file.
        idx, event = [], {}
        for tokenid in token.keys():
            idx.append(int(tokenid.replace('T', '')))
            event[eventid] = {
                'triggertokenid': tokenid,
                'name': token[tokenid]['label'],
                'arguments': [],
            }
            eventid += 1
        # New T ids continue after the highest id already in the file.
        tokid = max(idx) + 1
        last_tokenid = tokenid

        words = result[fileno]
        # The context manager closes the handle even if a lookup below raises.
        with codecs.open(annfile, 'a', 'utf8') as f:
            sample, offset = [], []
            oldlabel = 'O'
            for wordno in words.keys():
                word = words[wordno]
                predlabel = word['pred']
                begins = predlabel.startswith('B-')
                inside = predlabel.startswith('I-')
                # An I- tag of a different type than the previous prediction
                # is handled exactly like a B- tag.
                if begins or (inside and predlabel[2:] != oldlabel[2:]):
                    if sample:
                        # The finished span is matched using the fields of the
                        # current word, as in the original code.
                        _write_argument(f, token, event, last_tokenid, word,
                                        sample, offset, head, label, tokid)
                        tokid += 1
                    sample = [word['text']]
                    offset = [word['offset']]
                    label = predlabel[2:]
                elif inside:
                    sample.append(word['text'])
                    offset.append(word['offset'])
                oldlabel = predlabel

            if sample:
                # Flush the span still open after the last word.
                _write_argument(f, token, event, last_tokenid, word, sample,
                                offset, head, label, tokid)

            for evid in event.keys():
                f.write('E' + str(evid) + '\t' + event[evid]['name'] + ':' +
                        event[evid]['triggertokenid'])
                for arg in event[evid]['arguments']:
                    f.write(' ' + arg['argname'] + ':' + 'T' +
                            str(arg['value']))
                f.write('\n')
Ejemplo n.º 8
0
def cuthead(txtfile):
    """Return the header size of ``txtfile``.

    The size is the index of the first ``"<text>"`` occurrence plus 7.
    ``str.index`` raises ``ValueError`` when the marker is absent.
    """
    contents = utils.readFileEncode(txtfile, 'utf-8')
    marker_start = contents.index("<text>")
    # presumably 6 characters for the tag plus one following character
    # (e.g. a newline) - TODO confirm against the .txt format
    return marker_start + 7