Example #1
import json
import os
import re
import subprocess
from collections import defaultdict

import NER  #assumed project-local module providing test(), LFD() and harmonic_mean()


def final_scan(exdeflike,
               indeflike,
               multi=True,
               lowercase=False,
               externalPOS=False,
               outkey="default"):
    """train all of the given training data and then test it on the supplied
    test records. make predictions for NE for each token, then print them
    out in the format required"""
    """
    #load exdef and indef
    with open('finalexdef.pickle', 'rb') as infile:
        exdeflike = pickle.load(infile)

    with open('finalindef.pickle', 'rb') as infile2:
        indeflike = pickle.load(infile2)
    """
    #load the test data
    test_file = '../data/emerging.dev.conll'
    with open(test_file, 'r') as f3:
        records = re.split("\n[\t]?\n", f3.read().strip())

    numrecs = len(records)

    os.system("mkdir -p ../data/predictions/" + outkey)

    #analyze the test data
    # threshold = [0.138, 0.13]
    #this is the threshold we found to give the best F1 score
    #on the training data, using n-fold validation

    tstarts = {
        "location": 0.5,
        "group": 0.5,
        "product": 0.5,
        "creative-work": 0.5,
        "person": 0.5,
        "corporation": 0.5
    }

    tdiffs = range(-4, 5)  # finer alternative grid: range(-49, 50) with /100 steps, i.e. diffs in [-0.49, 0.49]
    allresults = defaultdict(list)

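    #each round sweeps a grid of thresholds centered on the current best value
    #for every NE type, scores the predictions with wnuteval.py, and keeps the
    #threshold with the best F1; the grid step shrinks tenfold per round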
    ## begin rounds loop here
    for rnd in range(1, 4):  ##range(1,2): ##
        print "working on rnd " + str(rnd)
        print "here are the starting thresholds: "
        for NEtype in tstarts:
            print NEtype, tstarts[NEtype]
        print
        results = defaultdict(list)
        tdiffs = [tdiff / 10. for tdiff in tdiffs]  #float division so the grid narrows tenfold each round

        NEthreshs = {
            NEtype: [tstarts[NEtype] + tdiff for tdiff in tdiffs]
            # [t/1000 for t in range(1001)]
            for NEtype in tstarts
        }
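        #one prediction file per (threshold, NE type) pair: created empty here,
        #then appended to record by record below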
        fs = {}
        for NEtype in NEthreshs:
            for t in NEthreshs[NEtype]:
                fkey = str(t) + "-" + NEtype
                threshfile = re.sub("/data/",
                                    "/data/predictions/" + outkey + "/",
                                    test_file + "-" + fkey + ".prediction")
                fs[fkey] = [open(threshfile, 'w'), threshfile]
                fs[fkey][0].close()
        numdone = 0
        for record in records:
            print str(
                100 * numdone / numrecs) + "% done with round " + str(rnd)
            numdone += 1
            if record:  #avoid empty strings
                data = [
                    re.split('\t', d) for d in re.split("\n", record)
                    if len(re.split("\t", d)) == 2
                ]
                tokens, tags = zip(*data)
                uppertokens = list(tokens)
                if lowercase:
                    tokens = [token.lower() for token in tokens]

                for NEtype in NEthreshs:
                    exavedeflike, inavedeflike = NER.test(
                        tokens,
                        exdeflike,
                        indeflike,
                        multi,
                        NEtype,
                        externalPOS=externalPOS,
                        uppertokens=uppertokens)

                    for t in NEthreshs[NEtype]:
                        #keep track of the NE assignments
                        #for each token with tuples
                        if lowercase:
                            assignments = [[token, 'O']
                                           for token in uppertokens if token]
                        else:
                            assignments = [[token, 'O'] for token in tokens
                                           if token]
                        fkey = str(t) + "-" + NEtype
                        #find the NEs using the _LFD_ function as before
                        for indices in NER.LFD(tokens, exavedeflike,
                                               inavedeflike, [1.1, t]):

                            # print t, [tokens[ix] for ix in indices]
                            # innums = [inavedeflike[ix][1] for ix in indices if ix != indices[0]]
                            # innums.append(inavedeflike[indices[0]][0])
                            # print "internal", NER.harmonic_mean(innums)
                            # raw_input()

                            n = 0
                            for index in indices:
                                if n == 0:
                                    assignments[index][1] = 'B-' + NEtype
                                else:
                                    assignments[index][1] = 'I-' + NEtype
                                n += 1  #keep track of position in NE

                        ## write out according to file handles, here
                        fs[fkey][0] = open(fs[fkey][1], "a")
                        for i, assignment in enumerate(assignments):
                            fs[fkey][0].writelines("\t".join(
                                [assignment[0], tags[i], assignment[1]]) +
                                                   "\n")
                        fs[fkey][0].writelines("\n")
                        fs[fkey][0].close()

        ## evaluate all thresholds and all NE types for the best of the round
        for fkey in fs:
            ## fs[fkey][0].close()
            NEtype = "-".join(re.split("-", fkey)[1:])
            t = float(re.split("-", fkey)[0])
            filename = fs[fkey][1]
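            #score this prediction file with wnuteval.py and pull out the
            #precision;recall;F1 triple reported for this NE type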
            try:
                results[NEtype].append((map(
                    float,
                    re.split("\;", [
                        re.sub("[^0-9\.\;]+", "", re.sub("\d+$|\d\:", "", r))
                        for r in re.split(
                            "\n",
                            subprocess.check_output(
                                "python2 ../data/wnuteval.py " + filename,
                                shell=True)) if re.search(NEtype, r)
                    ][0])), t))
            except Exception:  #scoring failed or no line for this NE type; count as 0
                results[NEtype].append(([0., 0., 0.], t))
            allresults[NEtype].append(tuple(results[NEtype][-1]))

        ## store the best of this round as tstarts
        for NEtype in results:
            tstarts[NEtype] = max(results[NEtype], key=lambda x: x[0][2])[1]

        print "here are the end-of-round thresholds: "
        for NEtype in tstarts:
            print NEtype, tstarts[NEtype]
        print
    with open("../data/predictions/" + outkey + "/allresults.json", "w") as f:
        f.writelines(json.dumps([tstarts, allresults]))
    return tstarts, allresults
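
A minimal usage sketch for final_scan, assuming the likelihood tables were pickled to finalexdef.pickle and finalindef.pickle as in the commented-out loader inside the function; the output key "myrun" is a hypothetical run label.

import pickle

with open('finalexdef.pickle', 'rb') as infile:
    exdeflike = pickle.load(infile)
with open('finalindef.pickle', 'rb') as infile2:
    indeflike = pickle.load(infile2)

#sweeps thresholds on ../data/emerging.dev.conll and writes one prediction
#file per (threshold, NE type) pair under ../data/predictions/myrun/
tstarts, allresults = final_scan(exdeflike, indeflike, outkey="myrun")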
Example #2
import pickle
import re

import NER  #assumed project-local module providing test(), LFD() and harmonic_mean()


def final_analysis(
    exdeflikefile,
    indeflikefile,
    multi=True,
    lowercase=True,
    externalPOS=True,
    dev=True,
    thresholds={
        "location": 0.292,
        "group": 0.09,
        "product": 0.131,
        "creative-work": 1.1,
        "person": 0.202,
        "corporation": 1.1
    }):

    """Test the pre-trained likelihood model on the supplied test records,
    predicting an NE tag for each token at the given per-type thresholds, and
    write the predictions out in the required format."""

    #load exdef and indef
    with open(exdeflikefile, 'rb') as f:
        exdeflike = pickle.load(f)

    with open(indeflikefile, 'rb') as f:
        indeflike = pickle.load(f)
    """
    #load exdef and indef
    with open('finalexdef.pickle', 'rb') as infile:
        exdeflike = pickle.load(infile)

    with open('finalindef.pickle', 'rb') as infile2:
        indeflike = pickle.load(infile2)
    """
    #load the test data
    if dev:
        test_file = '../data/emerging.dev.conll'
        outfilename = "../data/finalpredictions/emerging_" + "_".join(
            re.split("_", indeflikefile)[1:3]) + ".dev"
    else:
        test_file = '../data/emerging.test'
        outfilename = "../data/finalpredictions/emerging_" + "_".join(
            re.split("_", indeflikefile)[1:3]) + ".test"

    with open(test_file, 'r') as f3:
        records = re.split("\n[\t]?\n", f3.read().strip())

    #analyze the test data
    #on the training data, using n-fold validation
    # f = open(test_file + ".prediction", 'w')
    f = open(outfilename, "w")

    # thresholds = {
    #     ## "location": [0.001, 0.157],
    #     "location": [1.1, 0.157],
    #     ## "group": [0.008, 0.199],
    #     "group": [1.1, 0.199],
    #     "product": [1.1, 0.215],
    #     "creative-work": [1.1,0.499],
    #     ## "person": [0.002, 0.167],
    #     "person": [1.1, 0.167],
    #     "corporation": [1.1, 0.218]
    # }

    for record in records:
        if record:  #avoid empty strings
            if dev:
                data = [
                    re.split('\t', d) for d in re.split("\n", record)
                    if len(re.split("\t", d)) == 2
                ]
                tokens, tags = zip(*data)
            else:
                tokens = [
                    re.split('\t', d)[0] for d in re.split("\n", record)
                    if len(re.split("\t", d)) == 1
                ]
            uppertokens = list(tokens)
            if lowercase:
                tokens = [token.lower() for token in tokens]
            #keep track of the NE assignments for each token with tuples
            if lowercase:
                assignments = [[token, 'O'] for token in uppertokens if token]
            else:
                assignments = [[token, 'O'] for token in tokens if token]
            ##

            predictions = {}
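            #collect every candidate span for every NE type before resolving
            #overlaps; each entry stores span length, external likelihood, and
            #internal (harmonic-mean) likelihood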

            for NEtype in thresholds:
                exavedeflike, inavedeflike = NER.test(tokens,
                                                      exdeflike,
                                                      indeflike,
                                                      multi,
                                                      NEtype,
                                                      externalPOS=externalPOS,
                                                      uppertokens=uppertokens)

                #find the NEs using the _LFD_ function as before
                for indices in NER.LFD(tokens, exavedeflike, inavedeflike,
                                       [1.1, thresholds[NEtype]]):
                    # if exavedeflike[indices] >= thresholds[NEtype][0]:
                    #     print NEtype+": ", [tokens[ix] for ix in indices]
                    #     print "external", exavedeflike[indices]
                    innums = [
                        inavedeflike[ix][1] for ix in indices
                        if ix != indices[0]
                    ]
                    innums.append(inavedeflike[indices[0]][0])
                    ## print "internal", NER.harmonic_mean(innums)

                    predictions[(indices, NEtype)] = [
                        len(list(indices)), exavedeflike[indices],
                        NER.harmonic_mean(innums)
                    ]
            ##
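            #resolve overlapping predictions: when two spans share a token, the
            #span that starts earlier wins; on a tied start the longer span
            #wins, then the one with the higher internal likelihood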
            for indices, NEtype in predictions:
                thissize = predictions[(indices, NEtype)][0]
                thislike = predictions[(indices, NEtype)][2]
                for otherindices, otherNEtype in predictions:
                    thatsize = predictions[(otherindices, otherNEtype)][0]
                    thatlike = predictions[(otherindices, otherNEtype)][2]
                    broken = True
                    for ix in otherindices:
                        if ix in indices:
                            if otherindices[0] < indices[0]:
                                print("precidence, avoided: ",
                                      [tokens[ix] for ix in indices], " over ",
                                      [tokens[ix] for ix in otherindices])
                                break
                            elif otherindices[0] == indices[0]:
                                if thatsize > thissize:
                                    print("size, avoided: ",
                                          [tokens[ix]
                                           for ix in indices], " over ",
                                          [tokens[ix] for ix in otherindices])
                                    break
                                elif thatlike > thislike:
                                    print("likelihood, avoided: " + NEtype,
                                          [tokens[ix] for ix in indices
                                           ], " over " + otherNEtype,
                                          [tokens[ix] for ix in otherindices])
                                    break
                    else:
                        broken = False
                    if broken:
                        break
                else:

                    print NEtype + ": ", [tokens[ix] for ix in indices]
                    print "internal", thislike
                    ##
                    #assign 'B' to the first, 'I' to the rest
                    n = 0
                    for index in indices:
                        if n == 0:
                            assignments[index][1] = 'B-' + NEtype
                        else:
                            assignments[index][1] = 'I-' + NEtype
                        n += 1  #keep track of position in NE

            ##

            for i, assignment in enumerate(assignments):
                if dev:
                    f.writelines(
                        "\t".join([assignment[0], tags[i], assignment[1]]) +
                        "\n")
                else:
                    f.writelines("\t".join([assignment[0], assignment[1]]) +
                                 "\n")

            f.writelines("\n")