def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st,
                   domainRange, inputPath, confidence, nbcPath):
    """Evaluate pattern-based relation extraction for one file under many settings.

    Every article of ``filename`` that has an answer document in the MongoDB
    collection ``projizz.result.yago.answer`` is processed once per experiment
    setting: relations are extracted from the article's matched patterns and
    compared with the answer's ``observed`` relations.

    Settings are keyed ``"<degree>-<ambiguity>-<type>"``:

    * degree (1..5): patterns listing more candidate relations than this
      are skipped.
    * ambiguity: ``"1"`` (only built for degree 1), ``"one"``, ``"50"``,
      ``"75"`` or ``"all"``; controls which candidates of a multi-relation
      pattern are tried.
    * type: ``"t"`` additionally requires the relation's domain to be one of
      the answer's types, ``"n"`` does not.

    Args:
        jobid: Worker number; only used in progress messages.
        filename: File name looked up under both ``inputPtnPath`` and
            ``inputPath``.
        inputPtnPath: Directory of the pattern-match JSON. Its keys are
            ``"<revid>.txt"``; each value is a list of
            ``[line number, [[pattern id, start, end], ...]]`` entries.
        model: Unused.
        table: Maps pattern id (as string) to a dict with a ``"relations"``
            list.
        partAns: Result template mapping attribute -> dict of lists
            (``"tp"``, ``"fn"``, ``"fp"``, ``"et1"``, ``"et2"``, ``"et3"``).
            It is deep-copied per setting and never modified. An attribute
            named ``"produced"`` is skipped.
        st: Maps pattern id to a sequence of
            ``[relation, {"support": ..., "total": ...}]`` entries. The first
            entry's support is used as the reference value (presumably the
            list is sorted by support, highest first -- confirm).
        domainRange: Maps relation -> dict with a ``"domain"`` entry.
        inputPath: Directory of the article JSON.
        confidence: Forwarded to ``projizz.isPatternValidate``.
        nbcPath: Forwarded to ``projizz.getNBClassifiers``.

    Returns:
        dict mapping each setting key to its own copy of ``partAns`` with the
        answers' revids appended to the matching result lists.
    """
    # read patterns in articles (the file is closed as soon as it is parsed)
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print("Worker %d : Read %s into filter" % (jobid, filename))

    # connect to database
    # NOTE(review): the connection is never closed explicitly.
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # keys look like "<revid>.txt"; strip the 4-character extension
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    def acceptRelation(ptnId, rela, lineText, types, typed, relaEx, ptnExRela):
        """Add ``rela`` to ``relaEx`` if it passes every check; return True if added."""
        if typed and domainRange[rela]["domain"] not in types:
            return False
        if rela in relaEx:
            return False
        classifier = classifiers[rela]
        if classifier is None:
            return False
        if not classifier.classify(lineText) == "pos":
            return False
        relaEx.append(rela)
        # remember which patterns raised the relation (used by error analysis)
        producers = ptnExRela.setdefault(rela, [])
        if ptnId not in producers:
            producers.append(ptnId)
        return True

    def extractRelations(ptnEx, article, types, degree, ambigu, typed):
        """Return (extracted relations, relation -> pattern ids) for one article."""
        relaEx = []
        ptnExRela = {}  # rela: ptns

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]
            if lineText[0] == "^":  # a wikipedia reference comment, ignore it
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                # validate the pattern
                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue

                # get all possible relations of this pattern
                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > degree:
                    continue

                if len(rfp) == 1:
                    # unambiguous pattern
                    if st[ptnId][0][1]["support"] > 0:
                        acceptRelation(ptnId, rfp[0], lineText, types, typed, relaEx, ptnExRela)
                elif ambigu == "one":
                    if typed:
                        # take the first supported, type-compatible candidate
                        # that is new and accepted by its classifier
                        for ptnst in st[ptnId]:
                            # ptnst[0] = relation
                            # ptnst[1] = {"support": , "total": }
                            if ptnst[1]["support"] > 0 and acceptRelation(
                                    ptnId, ptnst[0], lineText, types, True, relaEx, ptnExRela):
                                break
                    elif st[ptnId][0][1]["support"] > 0:
                        # NOTE(review): support is read from st[ptnId][0] but the
                        # relation tried is rfp[0]; they are the same relation only
                        # if both lists start with the same entry -- confirm.
                        acceptRelation(ptnId, rfp[0], lineText, types, False, relaEx, ptnExRela)
                elif ambigu == "all":
                    for ptnst in st[ptnId]:
                        acceptRelation(ptnId, ptnst[0], lineText, types, typed, relaEx, ptnExRela)
                else:
                    # threshold selection: keep candidates whose support is at
                    # least `th` times the support of the first entry
                    th = 0.5 if ambigu == "50" else 0.75
                    b = st[ptnId][0][1]["support"]
                    if b > 0:
                        for ptnst in st[ptnId]:
                            if float(ptnst[1]["support"]) / float(b) >= th:
                                acceptRelation(ptnId, ptnst[0], lineText, types, typed, relaEx, ptnExRela)

        return relaEx, ptnExRela

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1, 6):
        for typ in ["n", "t"]:
            ambiguities = ["1"] if deg == 1 else ["one", "50", "75", "all"]
            for amb in ambiguities:
                expResult["%d-%s-%s" % (deg, amb, typ)] = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    # parse every setting key once instead of once per article
    settings = []
    for keyname in expResult:
        degreeText, ambigu, typ = keyname.split("-")
        settings.append((keyname, int(degreeText), ambigu, typ == "t"))

    for ans in itr:
        count += 1
        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        # original properties; in theory there are more of them than in "observed"
        originRela = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname, degree, ambigu, typed in settings:
            # Relation extraction
            relaEx, ptnExRela = extractRelations(ptnEx, article, types, degree, ambigu, typed)

            # Evaluation
            result = expResult[keyname]
            for attribute in result:
                # special case, ignore.
                if attribute == "produced":
                    continue

                record = result[attribute]
                positive = attribute in relaEx

                if attribute in relation:
                    record["tp" if positive else "fn"].append(revid)
                    continue

                if not positive:
                    # ignore true-negative
                    continue

                # False Positive
                record["fp"].append(revid)

                # TODO - analyze the cause of the error
                if attribute not in ptnExRela:
                    # should not happen: every extracted relation is recorded
                    continue

                if attribute in originRela:
                    # type 2 error: listed in the answer's properties but not observed
                    record["et2"].append(revid)
                    continue

                found = False
                for pid in ptnExRela[attribute]:  # patterns that raised the relation
                    for psbR in table[pid]["relations"]:
                        if psbR == attribute:
                            continue
                        # the pattern could also raise a relation that is
                        # correct for this answer, whether it was chosen or not
                        if domainRange[psbR]["domain"] in types and psbR in relation:
                            found = True
                            break

                # type 1 error: a producing pattern had a correct alternative
                # type 3 error: none of the producing patterns had one
                record["et1" if found else "et3"].append(revid)

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
# Example #2
# 0
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st,
                   domainRange, inputPath, confidence, nbcPath):
    """Evaluate pattern-based relation extraction for one file under many settings.

    Every article of ``filename`` that has an answer document in the MongoDB
    collection ``projizz.result.yago.answer`` is processed once per experiment
    setting: relations are extracted from the article's matched patterns and
    compared with the answer's ``observed`` relations.

    Settings are keyed ``"<degree>-<ambiguity>-<type>"``:

    * degree (1..5): patterns listing more candidate relations than this
      are skipped.
    * ambiguity: ``"1"`` (only built for degree 1), ``"one"``, ``"50"``,
      ``"75"`` or ``"all"``; controls which candidates of a multi-relation
      pattern are tried.
    * type: ``"t"`` additionally requires the relation's domain to be one of
      the answer's types, ``"n"`` does not.

    Args:
        jobid: Worker number; only used in progress messages.
        filename: File name looked up under both ``inputPtnPath`` and
            ``inputPath``.
        inputPtnPath: Directory of the pattern-match JSON. Its keys are
            ``"<revid>.txt"``; each value is a list of
            ``[line number, [[pattern id, start, end], ...]]`` entries.
        model: Unused.
        table: Maps pattern id (as string) to a dict with a ``"relations"``
            list.
        partAns: Result template mapping attribute -> dict of lists
            (``"tp"``, ``"fn"``, ``"fp"``, ``"et1"``, ``"et2"``, ``"et3"``).
            It is deep-copied per setting and never modified. An attribute
            named ``"produced"`` is skipped.
        st: Maps pattern id to a sequence of
            ``[relation, {"support": ..., "total": ...}]`` entries. The first
            entry's support is used as the reference value (presumably the
            list is sorted by support, highest first -- confirm).
        domainRange: Maps relation -> dict with a ``"domain"`` entry.
        inputPath: Directory of the article JSON.
        confidence: Forwarded to ``projizz.isPatternValidate``.
        nbcPath: Forwarded to ``projizz.getNBClassifiers``.

    Returns:
        dict mapping each setting key to its own copy of ``partAns`` with the
        answers' revids appended to the matching result lists.
    """
    # read patterns in articles (the file is closed as soon as it is parsed)
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print("Worker %d : Read %s into filter" % (jobid, filename))

    # connect to database
    # NOTE(review): the connection is never closed explicitly.
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # keys look like "<revid>.txt"; strip the 4-character extension
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    def acceptRelation(ptnId, rela, lineText, types, typed, relaEx, ptnExRela):
        """Add ``rela`` to ``relaEx`` if it passes every check; return True if added."""
        if typed and domainRange[rela]["domain"] not in types:
            return False
        if rela in relaEx:
            return False
        classifier = classifiers[rela]
        if classifier is None:
            return False
        if not classifier.classify(lineText) == "pos":
            return False
        relaEx.append(rela)
        # remember which patterns raised the relation (used by error analysis)
        producers = ptnExRela.setdefault(rela, [])
        if ptnId not in producers:
            producers.append(ptnId)
        return True

    def extractRelations(ptnEx, article, types, degree, ambigu, typed):
        """Return (extracted relations, relation -> pattern ids) for one article."""
        relaEx = []
        ptnExRela = {}  # rela: ptns

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]
            if lineText[0] == "^":  # a wikipedia reference comment, ignore it
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                # validate the pattern
                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue

                # get all possible relations of this pattern
                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > degree:
                    continue

                if len(rfp) == 1:
                    # unambiguous pattern
                    if st[ptnId][0][1]["support"] > 0:
                        acceptRelation(ptnId, rfp[0], lineText, types, typed, relaEx, ptnExRela)
                elif ambigu == "one":
                    if typed:
                        # take the first supported, type-compatible candidate
                        # that is new and accepted by its classifier
                        for ptnst in st[ptnId]:
                            # ptnst[0] = relation
                            # ptnst[1] = {"support": , "total": }
                            if ptnst[1]["support"] > 0 and acceptRelation(
                                    ptnId, ptnst[0], lineText, types, True, relaEx, ptnExRela):
                                break
                    elif st[ptnId][0][1]["support"] > 0:
                        # NOTE(review): support is read from st[ptnId][0] but the
                        # relation tried is rfp[0]; they are the same relation only
                        # if both lists start with the same entry -- confirm.
                        acceptRelation(ptnId, rfp[0], lineText, types, False, relaEx, ptnExRela)
                elif ambigu == "all":
                    for ptnst in st[ptnId]:
                        acceptRelation(ptnId, ptnst[0], lineText, types, typed, relaEx, ptnExRela)
                else:
                    # threshold selection: keep candidates whose support is at
                    # least `th` times the support of the first entry
                    th = 0.5 if ambigu == "50" else 0.75
                    b = st[ptnId][0][1]["support"]
                    if b > 0:
                        for ptnst in st[ptnId]:
                            if float(ptnst[1]["support"]) / float(b) >= th:
                                acceptRelation(ptnId, ptnst[0], lineText, types, typed, relaEx, ptnExRela)

        return relaEx, ptnExRela

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1, 6):
        for typ in ["n", "t"]:
            ambiguities = ["1"] if deg == 1 else ["one", "50", "75", "all"]
            for amb in ambiguities:
                expResult["%d-%s-%s" % (deg, amb, typ)] = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    # parse every setting key once instead of once per article
    settings = []
    for keyname in expResult:
        degreeText, ambigu, typ = keyname.split("-")
        settings.append((keyname, int(degreeText), ambigu, typ == "t"))

    for ans in itr:
        count += 1
        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        # original properties; in theory there are more of them than in "observed"
        originRela = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname, degree, ambigu, typed in settings:
            # Relation extraction
            relaEx, ptnExRela = extractRelations(ptnEx, article, types, degree, ambigu, typed)

            # Evaluation
            result = expResult[keyname]
            for attribute in result:
                # special case, ignore.
                if attribute == "produced":
                    continue

                record = result[attribute]
                positive = attribute in relaEx

                if attribute in relation:
                    record["tp" if positive else "fn"].append(revid)
                    continue

                if not positive:
                    # ignore true-negative
                    continue

                # False Positive
                record["fp"].append(revid)

                # TODO - analyze the cause of the error
                if attribute not in ptnExRela:
                    # should not happen: every extracted relation is recorded
                    continue

                if attribute in originRela:
                    # type 2 error: listed in the answer's properties but not observed
                    record["et2"].append(revid)
                    continue

                found = False
                for pid in ptnExRela[attribute]:  # patterns that raised the relation
                    for psbR in table[pid]["relations"]:
                        if psbR == attribute:
                            continue
                        # the pattern could also raise a relation that is
                        # correct for this answer, whether it was chosen or not
                        if domainRange[psbR]["domain"] in types and psbR in relation:
                            found = True
                            break

                # type 1 error: a producing pattern had a correct alternative
                # type 3 error: none of the producing patterns had one
                record["et1" if found else "et3"].append(revid)

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
# Example #3
# 0
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns,
           domainRange, confidence, nbcPath):
    """Extract relations for every article of one file and score them.

    For each article of ``filename`` that has an answer document in the
    MongoDB collection ``projizz.result.yago.answer``, every candidate
    relation of every valid matched pattern is tried. A candidate is kept
    when its domain is one of the answer's types and its classifier labels
    the matched line ``"pos"``. The kept relations are then compared with the
    answer's ``observed`` relations.

    Args:
        jobid: Worker number; only used in progress messages.
        filename: File name looked up under both ``inputPath`` and
            ``inputPtnPath``.
        inputPath: Directory of the article JSON.
        inputPtnPath: Directory of the pattern-match JSON. Its keys are
            ``"<revid>.txt"``; each value is a list of
            ``[line number, [[pattern id, start, end], ...]]`` entries.
        table: Maps pattern id (as string) to a dict with a ``"relations"``
            list.
        st: Maps pattern id to a sequence of
            ``[relation, {"support": ..., "total": ...}]`` entries.
        partAns: Maps attribute -> dict with ``"tp"``, ``"fn"`` and ``"fp"``
            lists. It is updated in place and is also the return value. An
            attribute named ``"produced"`` is skipped.
        domainRange: Maps relation -> dict with a ``"domain"`` entry (same
            shape as used by ``filterFunction``).
        confidence: Forwarded to ``projizz.isPatternValidate``.
        nbcPath: Forwarded to ``projizz.getNBClassifiers``.

    Returns:
        ``partAns`` with the answers' revids appended to the result lists.
    """
    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print("Worker %d : Read %s" % (jobid, filename))

    # connect to database
    # NOTE(review): the connection is never closed explicitly.
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # keys look like "<revid>.txt"; strip the 4-character extension
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0
    # NOTE: this is the caller's object, not a copy; results accumulate in it.
    expResult = partAns

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Relation extraction.
        # The list must start empty for every article; otherwise relations
        # found in earlier articles would be scored as positives here too.
        relaEx = []

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]
            if lineText[0] == "^":  # a wikipedia reference comment, ignore it
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue

                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > 5:
                    continue

                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                for ptnst in st[ptnId]:
                    # ptnst[0] = relation
                    # ptnst[1] = {"support":,"total": }
                    rela = ptnst[0]

                    # compare the relation's domain (not the whole
                    # domain/range record) with the answer's types
                    if domainRange[rela]["domain"] not in types:
                        continue

                    # already extracted: no need to classify again
                    if rela in relaEx:
                        continue

                    classifier = classifiers[rela]
                    if classifier is None:
                        continue

                    if classifier.classify(lineText) == "pos":
                        relaEx.append(rela)

        # Evaluation
        for attribute in expResult:
            # special case, ignore.
            if attribute == "produced":
                continue

            positive = attribute in relaEx

            if attribute in relation:
                expResult[attribute]["tp" if positive else "fn"].append(revid)
            elif positive:
                expResult[attribute]["fp"].append(revid)
            # true negatives are not recorded

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
# Example #4
# 0
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns,
           domainRange, confidence, nbcPath):
    """Extract relations for every article of one file and score them.

    For each article of ``filename`` that has an answer document in the
    MongoDB collection ``projizz.result.yago.answer``, every candidate
    relation of every valid matched pattern is tried. A candidate is kept
    when its domain is one of the answer's types and its classifier labels
    the matched line ``"pos"``. The kept relations are then compared with the
    answer's ``observed`` relations.

    Args:
        jobid: Worker number; only used in progress messages.
        filename: File name looked up under both ``inputPath`` and
            ``inputPtnPath``.
        inputPath: Directory of the article JSON.
        inputPtnPath: Directory of the pattern-match JSON. Its keys are
            ``"<revid>.txt"``; each value is a list of
            ``[line number, [[pattern id, start, end], ...]]`` entries.
        table: Maps pattern id (as string) to a dict with a ``"relations"``
            list.
        st: Maps pattern id to a sequence of
            ``[relation, {"support": ..., "total": ...}]`` entries.
        partAns: Maps attribute -> dict with ``"tp"``, ``"fn"`` and ``"fp"``
            lists. It is updated in place and is also the return value. An
            attribute named ``"produced"`` is skipped.
        domainRange: Maps relation -> dict with a ``"domain"`` entry (same
            shape as used by ``filterFunction``).
        confidence: Forwarded to ``projizz.isPatternValidate``.
        nbcPath: Forwarded to ``projizz.getNBClassifiers``.

    Returns:
        ``partAns`` with the answers' revids appended to the result lists.
    """
    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print("Worker %d : Read %s" % (jobid, filename))

    # connect to database
    # NOTE(review): the connection is never closed explicitly.
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # keys look like "<revid>.txt"; strip the 4-character extension
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0
    # NOTE: this is the caller's object, not a copy; results accumulate in it.
    expResult = partAns

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Relation extraction.
        # The list must start empty for every article; otherwise relations
        # found in earlier articles would be scored as positives here too.
        relaEx = []

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]
            if lineText[0] == "^":  # a wikipedia reference comment, ignore it
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue

                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > 5:
                    continue

                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                for ptnst in st[ptnId]:
                    # ptnst[0] = relation
                    # ptnst[1] = {"support":,"total": }
                    rela = ptnst[0]

                    # compare the relation's domain (not the whole
                    # domain/range record) with the answer's types
                    if domainRange[rela]["domain"] not in types:
                        continue

                    # already extracted: no need to classify again
                    if rela in relaEx:
                        continue

                    classifier = classifiers[rela]
                    if classifier is None:
                        continue

                    if classifier.classify(lineText) == "pos":
                        relaEx.append(rela)

        # Evaluation
        for attribute in expResult:
            # special case, ignore.
            if attribute == "produced":
                continue

            positive = attribute in relaEx

            if attribute in relation:
                expResult[attribute]["tp" if positive else "fn"].append(revid)
            elif positive:
                expResult[attribute]["fp"].append(revid)
            # true negatives are not recorded

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult