def filterFunction(jobid,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath):
    """Extract relations from one article file and score them against stored answers.

    Parameters:
        jobid         -- worker number, used only in progress messages.
        filename      -- name of the JSON file, looked up under both
                         inputPtnPath (patterns) and inputPath (articles).
        inputPtnPath  -- directory holding the per-article pattern JSON.
        model         -- not used by this function.
        table         -- maps pattern id (string) to a dict with "relations",
                         "used" and optionally "eval".
        partAns       -- maps each attribute to a dict with "tp", "fn", "fp"
                         and "tn" lists; revids are appended in place.
        st            -- maps pattern id to a sequence whose first entry
                         carries a "support" value at [1]["support"].
        domainRange   -- maps attribute to "" or to a dict with a "domain" key.
        inputPath     -- directory holding the article JSON.

    Returns:
        partAns, after one revid has been appended per attribute for every
        answer document found.
    """
    # read patterns in articles (context managers so the handles get closed)
    with open(os.path.join(inputPtnPath,filename),"r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    # read articles
    with open(os.path.join(inputPath,filename),"r") as articleFile:
        contentJson = json.load(articleFile)
    print("Worker %d : Read %s into filter" % (jobid,filename))

    politicalPosition = ["Secretary","Premier","Mayor","Captain","Minister","Chief","Governor","General","Ambassadors","Member"]

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # keys look like "<revid>.txt"; drop the 4-character extension
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid":{"$in":queries}})
    print("worker %d query=%d, result=%d" % (jobid,len(queries),itr.count()))

    count = 0

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])     # get revid
        targetName = projizz.getNamedEntityTokens(ans["_id"])   # get entity name's part
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])
        relaEx = []
        for line in ptnEx:                      # line[0]: line number
            lineText = article[line[0]]

            # No target name in line text -> go to next line.
            if not any(namedToken in lineText for namedToken in targetName):
                continue

            for ptn in line[1]:                 # line[1]: array of patterns
                ptnId = "%d" % (ptn[0])         # ptn[0]:  pattern ID, [1]: start, [2]: end
                ptnInfo = table[ptnId]
                rfp = ptnInfo["relations"]

                # ignore non-used pattern
                if not ptnInfo["used"]:
                    continue
                if "eval" in ptnInfo and not ptnInfo["eval"]:
                    continue

                # never seen pattern
                if ptnId not in st:
                    continue

                # a pattern without relations has nothing to contribute
                # (and rfp[0] below would raise IndexError)
                if not rfp:
                    continue

                # The single-relation and multi-relation cases are handled
                # the same way: an optional political-position hit, then the
                # first relation of the pattern.
                # NOTE(review): a political-position hit does not stop the
                # first relation from being added as well - confirm whether a
                # hit was meant to skip the rest of this pattern.
                if ("holdsPoliticalPosition" in rfp
                        and "holdsPoliticalPosition" not in relaEx
                        and any(position in lineText for position in politicalPosition)):
                    relaEx.append("holdsPoliticalPosition")

                # using the first as the answer
                if st[ptnId][0][1]["support"] > 0 and rfp[0] not in relaEx:
                    relaEx.append(rfp[0])

        # Remove impossible relations: keep an attribute when it has no
        # domain/range entry ("" - special case, produced) or when its domain
        # is one of the entity's types.
        relaEx = [attribute for attribute in relaEx
                  if domainRange[attribute] == ""
                  or domainRange[attribute]["domain"] in types]

        # Evaluation
        for attribute in partAns:
            predicted = attribute in relaEx
            actual = attribute in relation

            if actual:
                bucket = "tp" if predicted else "fn"
            else:
                bucket = "fp" if predicted else "tn"
            partAns[attribute][bucket].append(ans["revid"])

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid,count))
    return partAns
Esempio n. 2
0
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st,
                   domainRange, inputPath):
    """Extract relations from one article file and score them against stored answers.

    Parameters:
        jobid         -- worker number, used only in progress messages.
        filename      -- name of the JSON file, looked up under both
                         inputPtnPath (patterns) and inputPath (articles).
        inputPtnPath  -- directory holding the per-article pattern JSON.
        model         -- not used by this function.
        table         -- maps pattern id (string) to a dict with "relations",
                         "used" and optionally "eval".
        partAns       -- maps each attribute to a dict with "tp", "fn", "fp"
                         and "tn" lists; revids are appended in place.
        st            -- maps pattern id to a sequence whose first entry
                         carries a "support" value at [1]["support"].
        domainRange   -- maps attribute to "" or to a dict with a "domain" key.
        inputPath     -- directory holding the article JSON.

    Returns:
        partAns, after one revid has been appended per attribute for every
        answer document found.
    """
    # read patterns in articles (context managers so the handles get closed)
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    # read articles
    with open(os.path.join(inputPath, filename), "r") as articleFile:
        contentJson = json.load(articleFile)
    print("Worker %d : Read %s into filter" % (jobid, filename))

    politicalPosition = [
        "Secretary", "Premier", "Mayor", "Captain", "Minister", "Chief",
        "Governor", "General", "Ambassadors", "Member"
    ]

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # keys look like "<revid>.txt"; drop the 4-character extension
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        # get entity name's part
        targetName = projizz.getNamedEntityTokens(ans["_id"])
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])
        relaEx = []
        for line in ptnEx:  # line[0]: line number
            lineText = article[line[0]]

            # No target name in line text -> go to next line.
            if not any(namedToken in lineText for namedToken in targetName):
                continue

            for ptn in line[1]:  # line[1]: array of patterns
                # ptn[0]:  pattern ID, [1]: start, [2]: end
                ptnId = "%d" % (ptn[0])
                ptnInfo = table[ptnId]
                rfp = ptnInfo["relations"]

                # ignore non-used pattern
                if not ptnInfo["used"]:
                    continue
                if "eval" in ptnInfo and not ptnInfo["eval"]:
                    continue

                # never seen pattern
                if ptnId not in st:
                    continue

                # a pattern without relations has nothing to contribute
                # (and rfp[0] below would raise IndexError)
                if not rfp:
                    continue

                # The single-relation and multi-relation cases are handled
                # the same way: an optional political-position hit, then the
                # first relation of the pattern.
                # NOTE(review): a political-position hit does not stop the
                # first relation from being added as well - confirm whether a
                # hit was meant to skip the rest of this pattern.
                if ("holdsPoliticalPosition" in rfp
                        and "holdsPoliticalPosition" not in relaEx
                        and any(position in lineText
                                for position in politicalPosition)):
                    relaEx.append("holdsPoliticalPosition")

                # using the first as the answer
                if st[ptnId][0][1]["support"] > 0 and rfp[0] not in relaEx:
                    relaEx.append(rfp[0])

        # Remove impossible relations: keep an attribute when it has no
        # domain/range entry ("" - special case, produced) or when its domain
        # is one of the entity's types.
        relaEx = [
            attribute for attribute in relaEx
            if domainRange[attribute] == ""
            or domainRange[attribute]["domain"] in types
        ]

        # Evaluation
        for attribute in partAns:
            predicted = attribute in relaEx
            actual = attribute in relation

            if actual:
                bucket = "tp" if predicted else "fn"
            else:
                bucket = "fp" if predicted else "tn"
            partAns[attribute][bucket].append(ans["revid"])

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))
    return partAns
Esempio n. 3
0
def updateAnswer(jobid, inputPath, filename):
    """Mark, for each article in one file, which properties are observable in its text.

    For every answer document whose revid matches an article in the file,
    each property is looked up in the facts collection. A property counts as
    observed when any name token of any matching fact's object occurs in the
    article text. Documents with at least one observed property are written
    back with an added "observed" list.

    Parameters:
        jobid     -- worker number, used only in progress messages.
        inputPath -- directory that contains the article JSON file.
        filename  -- name of the article JSON file.

    Returns:
        A (filename, articles) tuple where articles lists the "<revid>.txt"
        ids whose answer document was updated.
    """
    contenJson = projizz.jsonRead(os.path.join(inputPath, filename))
    print("#%d - %s" % (jobid, filename))
    connect = Connection()
    answerCollection = connect.projizz.result.yago.answer
    factCollection = connect.projizz.yago.facts

    # keys look like "<revid>.txt"; drop the 4-character extension
    queries = [name[:-4] for name in contenJson]

    itr = answerCollection.find({"revid": {"$in": queries}})
    print("#%d - query=%d,result=%d" % (jobid, len(queries), itr.count()))

    ty1g = 0  # articles given up because they have no properties
    ty2g = 0  # articles given up because no property was observed
    articles = []
    for ans in itr:
        articleID = "%s.txt" % (ans["revid"])
        articleName = ans["_id"]
        properties = ans["properties"]
        # not consider references.

        if not properties:
            # give up those no properties' article
            ty1g += 1
            continue

        lines = projizz.articleSimpleSentenceFileter(contenJson[articleID])
        # every line is followed by one space, including the last one
        text = "".join(line + " " for line in lines)

        observed = []
        for pro in properties:
            pitr = factCollection.find({
                "property": pro,
                "subject": articleName
            })
            # A property without facts yields an empty cursor and is simply
            # not observed, so no separate count() query is needed. The
            # generator stops at the first matching token.
            if any(token in text
                   for fact in pitr
                   for token in projizz.getNamedEntityTokens(fact["object"])):
                observed.append(pro)

        if observed:
            articles.append(articleID)
            ans["observed"] = observed
            answerCollection.update({"revid": ans["revid"]}, ans, upsert=False)
        else:
            ty2g += 1

    print("#%d -> update %d (give up %d + %d)" % (jobid, len(articles), ty1g,
                                                  ty2g))

    return (filename, articles)
def updateAnswer(jobid,inputPath,filename):
    """Mark, for each article in one file, which properties are observable in its text.

    For every answer document whose revid matches an article in the file,
    each property is looked up in the facts collection. A property counts as
    observed when any name token of any matching fact's object occurs in the
    article text. Documents with at least one observed property are written
    back with an added "observed" list.

    Parameters:
        jobid     -- worker number, used only in progress messages.
        inputPath -- directory that contains the article JSON file.
        filename  -- name of the article JSON file.

    Returns:
        A (filename, articles) tuple where articles lists the "<revid>.txt"
        ids whose answer document was updated.
    """
    contenJson = projizz.jsonRead(os.path.join(inputPath,filename))
    print("#%d - %s" % (jobid,filename))
    connect = Connection()
    answerCollection = connect.projizz.result.yago.answer
    factCollection = connect.projizz.yago.facts

    # keys look like "<revid>.txt"; drop the 4-character extension
    queries = [name[:-4] for name in contenJson]

    itr = answerCollection.find({"revid":{"$in":queries}})
    print("#%d - query=%d,result=%d" % (jobid,len(queries),itr.count()))

    ty1g = 0    # articles given up because they have no properties
    ty2g = 0    # articles given up because no property was observed
    articles = []
    for ans in itr:
        articleID = "%s.txt" % (ans["revid"])
        articleName = ans["_id"]
        properties = ans["properties"]
        # not consider references.

        if not properties:
            # give up those no properties' article
            ty1g += 1
            continue

        lines = projizz.articleSimpleSentenceFileter(contenJson[articleID])
        # every line is followed by one space, including the last one
        text = "".join(line + " " for line in lines)

        observed = []
        for pro in properties:
            pitr = factCollection.find({"property":pro,"subject":articleName})
            # A property without facts yields an empty cursor and is simply
            # not observed, so no separate count() query is needed. The
            # generator stops at the first matching token.
            if any(token in text
                   for fact in pitr
                   for token in projizz.getNamedEntityTokens(fact["object"])):
                observed.append(pro)

        if observed:
            articles.append(articleID)
            ans["observed"] = observed
            answerCollection.update({"revid":ans["revid"]},ans,upsert=False)
        else:
            ty2g += 1

    print("#%d -> update %d (give up %d + %d)" % (jobid,len(articles),ty1g,ty2g))

    return (filename,articles)