Exemple #1
0
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns,
           domainRange, confidence, vsmData):
    """Evaluate VSM-based relation extraction on one data file.

    For every answer document found in MongoDB for the articles in
    ``filename``, the patterns matched in the article are scored with
    ``projizz.vsmSimilarity`` and the resulting relations are compared with
    the observed relations of the answer, once per cosine threshold
    (0.00, 0.05, ..., 0.50).

    Args:
        jobid: worker number, only used in progress messages.
        filename: name of the JSON file, present in both input directories.
        inputPath: directory holding the article JSON files.
        inputPtnPath: directory holding the extracted-pattern JSON files.
        table: pattern table, indexed by pattern id string; entries are read
            through their "pattern" and "relations" keys.
        st: per-pattern statistics; ``st[ptnId][0][1]["support"]`` is read.
        partAns: result template, deep-copied once per threshold.  It is
            indexed as ``partAns[relation]["tp" | "fn" | "fp"]`` (lists).
        domainRange: indexed as ``domainRange[relation]["domain"]``.
        confidence: forwarded to ``projizz.isPatternValidate``.
        vsmData: forwarded to ``projizz.vsmSimilarity``.

    Returns:
        dict mapping each integer threshold (0, 5, ..., 50) to a copy of
        ``partAns`` whose "tp"/"fn"/"fp" lists received the revids of the
        evaluated articles.
    """
    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    print("Worker %d : Read %s" % (jobid, filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # keys look like "<revid>.txt"; strip the 4-character extension
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0

    # one result table per threshold (expressed in percent)
    expResult = {}
    for th in range(0, 51, 5):
        expResult[th] = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Relations extracted from THIS article, per threshold.  This must be
        # rebuilt for every article: keeping one list for the whole file made
        # relations found in earlier articles count as positives for all the
        # following ones.
        relaEx = {}
        for th in expResult:
            relaEx[th] = []

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(
                        ptnId, table, confidence=confidence, st=st):
                    continue

                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > 5:
                    continue

                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                # TODO - Modify string, remove pattern text in string?
                cosRlt = projizz.vsmSimilarity(article[line[0]],
                                               vsmData,
                                               relas=rfp,
                                               ptntext=table[ptnId]["pattern"])

                # NOTE - if cosine value > threshold then there is a relation (?)
                for pr in cosRlt:
                    # Check type (does not depend on the threshold)
                    if domainRange[pr]["domain"] not in types:
                        continue

                    for th in expResult:
                        if cosRlt[pr] > float(th) / 100.0 \
                                and pr not in relaEx[th]:
                            relaEx[th].append(pr)

        #### Evaluation
        for th in expResult:
            extracted = relaEx[th]
            for attribute in expResult[th]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                stats = expResult[th][attribute]
                predicted = attribute in extracted
                expected = attribute in relation

                if expected and predicted:
                    stats["tp"].append(revid)
                elif expected:
                    stats["fn"].append(revid)
                elif predicted:
                    stats["fp"].append(revid)
                # true negatives are not recorded

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
Exemple #2
0
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns,
           domainRange, confidence, nbcPath):
    """Evaluate naive-Bayes-based relation extraction on one data file.

    For every answer document found in MongoDB for the articles in
    ``filename``, each valid pattern occurrence proposes the relations listed
    in ``st[ptnId]``; a relation is kept when its domain is one of the
    entity's types and its classifier labels the sentence "pos".  The kept
    relations are then compared with the observed relations of the answer.

    Args:
        jobid: worker number, only used in progress messages.
        filename: name of the JSON file, present in both input directories.
        inputPath: directory holding the article JSON files.
        inputPtnPath: directory holding the extracted-pattern JSON files.
        table: pattern table, indexed by pattern id string; the "relations"
            entry is read.
        st: per-pattern statistics; iterated as ``(relation, {"support":..})``
            pairs.
        partAns: result template, indexed as
            ``partAns[relation]["tp" | "fn" | "fp"]`` (lists).  It is copied,
            not modified.
        domainRange: indexed as ``domainRange[relation]["domain"]``.
        confidence: forwarded to ``projizz.isPatternValidate``.
        nbcPath: forwarded to ``projizz.getNBClassifiers``.

    Returns:
        A copy of ``partAns`` whose "tp"/"fn"/"fp" lists received the revids
        of the evaluated articles.
    """
    # Local import so this function does not depend on the module header.
    import copy

    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print("Worker %d : Read %s" % (jobid, filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # keys look like "<revid>.txt"; strip the 4-character extension
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0
    # Work on a private copy so the caller's template is never mutated
    # (the sibling evaluators deep-copy it as well).
    expResult = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Relations extracted from THIS article.  It has to be reset for each
        # article, otherwise relations found earlier in the file are counted
        # as positives for every later article.
        relaEx = []

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns

            lineText = article[line[0]]
            if lineText[0] == "^":
                # It's a wikipedia reference comment, ignore it!
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(
                        ptnId, table, confidence=confidence, st=st):
                    continue

                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > 5:
                    continue

                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                for ptnst in st[ptnId]:
                    # ptnst[0] = relation
                    # ptnst[1] = {"support":,"total": }
                    rela = ptnst[0]

                    # The domain of the relation must be one of the entity
                    # types.  The original compared the whole domainRange
                    # entry with the type names, which never matched and
                    # therefore rejected every relation.
                    if domainRange[rela]["domain"] not in types:
                        continue

                    classifier = classifiers[rela]
                    if classifier is None:
                        continue

                    if rela not in relaEx \
                            and classifier.classify(lineText) == "pos":
                        relaEx.append(rela)

        #### Evaluation
        for attribute in expResult:

            # special case, ignore.
            if attribute == "produced":
                continue

            stats = expResult[attribute]
            predicted = attribute in relaEx
            expected = attribute in relation

            if expected and predicted:
                stats["tp"].append(revid)
            elif expected:
                stats["fn"].append(revid)
            elif predicted:
                stats["fp"].append(revid)
            # true negatives are not recorded

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
def _recordPtnMakeRela(ptnId, rela, record):
    """Remember in `record` that pattern `ptnId` produced `rela` (no duplicates)."""
    ptns = record.setdefault(rela, [])
    if ptnId not in ptns:
        ptns.append(ptnId)


def _acceptRelation(rela, lineText, types, useType, relaEx, domainRange, classifiers):
    """Tell whether `rela` should be extracted from the sentence `lineText`.

    The relation is accepted when, in this order: its domain is one of
    `types` (only checked when `useType` is true), it has not been extracted
    yet, and a classifier exists for it and labels the sentence "pos".
    """
    if useType and domainRange[rela]["domain"] not in types:
        return False
    if rela in relaEx:
        return False
    classifier = classifiers[rela]
    return classifier is not None and classifier.classify(lineText) == "pos"


def filterFunction(jobid,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,nbcPath):
    """Run every extraction configuration on one data file and evaluate it.

    A configuration is named "<degree>-<ambiguity>-<type>":
      * degree (1..5): patterns with more candidate relations are skipped;
      * ambiguity: "one" (first suitable relation), "50"/"75" (relations
        whose support is at least 50% / 75% of the support of the first entry
        of ``st[ptnId]``), "all" (every relation); degree 1 only has "1";
      * type: "t" requires the relation domain to be one of the entity types,
        "n" does not.
    Every extracted relation must additionally be labelled "pos" by its
    naive-Bayes classifier.

    Args:
        jobid: worker number, only used in progress messages.
        filename: name of the JSON file, present in both input directories.
        inputPtnPath: directory holding the extracted-pattern JSON files.
        model: unused here.
        table: pattern table, indexed by pattern id string; the "relations"
            entry is read.
        partAns: result template, deep-copied once per configuration; indexed
            as ``partAns[relation][name]`` with list values for the names
            "tp", "fn", "fp", "et1", "et2" and "et3".
        st: per-pattern statistics, a sequence of
            ``(relation, {"support": ..., "total": ...})`` per pattern id.
        domainRange: indexed as ``domainRange[relation]["domain"]``.
        inputPath: directory holding the article JSON files.
        confidence: forwarded to ``projizz.isPatternValidate``.
        nbcPath: forwarded to ``projizz.getNBClassifiers``.

    Returns:
        dict mapping each configuration name to its filled copy of
        ``partAns``.
    """
    # read patterns in articles (the file is closed as soon as it is parsed)
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print("Worker %d : Read %s into filter" % (jobid, filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # keys look like "<revid>.txt"; strip the 4-character extension
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1, 6):
        for typ in ["n", "t"]:
            if deg == 1:
                # a degree-1 pattern has nothing to disambiguate
                ambiguities = ["1"]
            else:
                ambiguities = ["one", "50", "75", "all"]
            for amb in ambiguities:
                expResult["%d-%s-%s" % (deg, amb, typ)] = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        # original properties; in theory a superset of the observed ones
        originRela = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:

            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            useType = args[2] == "t"

            # Relation extraction
            relaEx = []
            ptnExRela = {}  # rela: ptns (kept for the error analysis below)

            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns

                lineText = article[line[0]]
                if lineText[0] == "^":
                    # It's a wikipedia reference comment, ignore it!
                    continue

                # everything _acceptRelation needs besides the relation name
                checkArgs = (lineText, types, useType, relaEx,
                             domainRange, classifiers)

                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line
                    ptnId = "%d" % (ptn[0])

                    # validate the pattern
                    if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                        continue

                    # get all possible relation of this pattern
                    rfp = table[ptnId]["relations"]

                    # check degree
                    if len(rfp) > degree:
                        continue

                    #
                    #   Decide which relation(s) to choose
                    #
                    if len(rfp) == 1:   # or degree == 1
                        if st[ptnId][0][1]["support"] > 0 \
                                and _acceptRelation(rfp[0], *checkArgs):
                            relaEx.append(rfp[0])
                            _recordPtnMakeRela(ptnId, rfp[0], ptnExRela)

                    elif ambigu == "one":
                        if useType:
                            # first supported relation that passes all checks
                            for ptnst in st[ptnId]:
                                # ptnst[0] = relation
                                # ptnst[1] = {"support": , "total": }
                                if ptnst[1]["support"] > 0 \
                                        and _acceptRelation(ptnst[0], *checkArgs):
                                    relaEx.append(ptnst[0])
                                    _recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                                    break
                        elif st[ptnId][0][1]["support"] > 0 \
                                and _acceptRelation(rfp[0], *checkArgs):
                            relaEx.append(rfp[0])
                            _recordPtnMakeRela(ptnId, rfp[0], ptnExRela)

                    elif ambigu == "all":
                        for ptnst in st[ptnId]:
                            if _acceptRelation(ptnst[0], *checkArgs):
                                relaEx.append(ptnst[0])
                                _recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)

                    else:
                        # "50" -> 0.5; anything else falls back to 0.75
                        th = 0.5 if ambigu == "50" else 0.75

                        b = st[ptnId][0][1]["support"]
                        if b > 0:
                            for ptnst in st[ptnId]:
                                if float(ptnst[1]["support"]) / float(b) >= th \
                                        and _acceptRelation(ptnst[0], *checkArgs):
                                    relaEx.append(ptnst[0])
                                    _recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)

            # Evaluation
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                stats = expResult[keyname][attribute]
                predicted = attribute in relaEx
                expected = attribute in relation

                if expected:
                    if predicted:
                        stats["tp"].append(revid)
                    else:
                        stats["fn"].append(revid)
                    continue

                if not predicted:
                    # ignore true-negative
                    continue

                # False Positive
                stats["fp"].append(revid)

                # TODO - analyse the cause of the error
                if attribute not in ptnExRela:
                    # Every extracted relation is recorded in ptnExRela, so
                    # this should not happen.
                    continue

                if attribute in originRela:
                    # type 2 error: the relation is among the original
                    # properties although it is not among the observed ones
                    stats["et2"].append(revid)
                    continue

                # Could one of the patterns that raised this relation also
                # raise a relation that IS in the answer?
                found = False
                for pid in ptnExRela[attribute]:
                    for psbR in table[pid]["relations"]:
                        if psbR == attribute:
                            continue
                        if domainRange[psbR]["domain"] in types and psbR in relation:
                            found = True
                            break
                    if found:
                        break

                if found:
                    # type 1 error
                    stats["et1"].append(revid)
                else:
                    # type 3 error
                    stats["et3"].append(revid)

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
def mapper(jobid, filename, inputPath, inputPtnPath, model, table, confidence):
    """Collect training sentences per relation from one data file.

    For every answer document found in MongoDB for the articles in
    ``filename``, each valid pattern occurrence (at most 5 candidate
    relations) is assigned to every relation of the pattern, either as a
    supporting line (the relation is among the observed ones) or as a
    non-supporting line.

    Args:
        jobid: worker number, only used in progress messages.
        filename: name of the JSON file, present in both input directories.
        inputPath: directory holding the article JSON files.
        inputPtnPath: directory holding the extracted-pattern JSON files.
        model: unused here.
        table: pattern table, indexed by pattern id string; the "relations"
            and "pattern" entries are read.
        confidence: forwarded to ``projizz.isPatternValidate``.

    Returns:
        A 4-tuple ``(linesByRelations, linesNoRelaByRelations, POS, NEG)``:
        the first two map a relation to lower-cased, tokenised lines with the
        pattern tokens removed; the last two map a relation to
        ``{"text": line, "label": "pos" | "neg"}`` records (lines starting
        with "^" are left out of these two).
    """
    # Load the article texts and the patterns found in them
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    print("Worker %d : Read %s into filter" % (jobid, filename))

    ### Connect to database
    connect = pymongo.Connection()
    collection = connect.projizz.result.yago.answer
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0

    # accumulated over the whole file
    linesByRelations = {}
    linesNoRelaByRelations = {}
    POS = {}
    NEG = {}

    for ans in itr:
        count += 1

        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # per-article bookkeeping: rela -> line number -> pattern ids
        linesByRela = {}
        linesByNoRela = {}
        # per-article bookkeeping: rela -> line numbers (binary classifier)
        pos = {}
        neg = {}

        for line in ptnEx:
            # line[0]: line number, line[1]: array of patterns
            lineNo = line[0]
            lineText = article[lineNo]

            for ptn in line[1]:
                # ptn[0]: pattern ID, ptn[1]/ptn[2]: start/end position
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(
                        ptnId, table, confidence=confidence):
                    continue

                candidates = table[ptnId]["relations"]

                # give up patterns whose degree is > 5
                if len(candidates) > 5:
                    continue

                for rela in candidates:
                    # a support instance goes to the "rela"/"pos" side,
                    # everything else to the "no rela"/"neg" side
                    if rela in relation:
                        byLine, labelled = linesByRela, pos
                    else:
                        byLine, labelled = linesByNoRela, neg

                    ptnIds = byLine.setdefault(rela, {}).setdefault(lineNo, [])
                    if ptnId not in ptnIds:
                        ptnIds.append(ptnId)

                    # For binary classifier
                    lineNumbers = labelled.setdefault(rela, [])
                    if lineText[0] != "^" and lineNo not in lineNumbers:
                        lineNumbers.append(lineNo)

        # Turn the recorded line numbers into token strings without the
        # tokens of the patterns that matched on that line.
        for collected, target in ((linesByRela, linesByRelations),
                                  (linesByNoRela, linesNoRelaByRelations)):
            for rela in collected:
                bucket = target.setdefault(rela, [])
                for lineN in collected[rela]:
                    tokens = projizz.getTokens(article[lineN].lower())
                    for ptnId in collected[rela][lineN]:
                        for ptntk in table[ptnId]["pattern"].split():
                            if ptntk in tokens:
                                tokens.remove(ptntk)
                    bucket.append(' '.join(tokens))

        # For binary classifier
        for collected, target, label in ((pos, POS, "pos"),
                                         (neg, NEG, "neg")):
            for rela in collected:
                bucket = target.setdefault(rela, [])
                for lineN in collected[rela]:
                    bucket.append({"text": article[lineN], "label": label})

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return linesByRelations, linesNoRelaByRelations, POS, NEG
Exemple #5
0
def mapper(jobid, filename, inputPath, inputPtnPath, model, table):
    """Collect, per article, the sentences that support each pattern/relation.

    A pattern occurrence supports a relation when the relation is one of the
    pattern's candidate relations and is among the observed relations of the
    article's answer document.

    Args:
        jobid: worker number, only used in progress messages.
        filename: name of the JSON file, present in both input directories.
        inputPath: directory holding the article JSON files.
        inputPtnPath: directory holding the extracted-pattern JSON files.
        model: unused here.
        table: pattern table, indexed by pattern id string; the "relations"
            entry is read.

    Returns:
        dict ``{"<revid>.txt": {ptnId: {relation: [line text, ...]}}}``.
    """
    # Load the article texts and the patterns found in them
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    print("Worker %d : Read %s into filter" % (jobid, filename))

    ### Connect to database
    connect = pymongo.Connection()
    collection = connect.projizz.result.yago.answer
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0
    supportInstanceByFile = {}

    for ans in itr:
        count += 1

        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        fileSupport = supportInstanceByFile[key] = {}

        for line in ptnEx:
            # line[0]: line number, line[1]: array of patterns
            lineNo = line[0]
            # The text is not used in this loop; the lookup is kept so that
            # an invalid line number fails at the same point as before.
            lineText = article[lineNo]

            for ptn in line[1]:
                # ptn[0]: pattern ID, ptn[1]/ptn[2]: start/end position
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table):
                    continue

                for rela in table[ptnId]["relations"]:
                    if rela not in relation:
                        continue

                    # it's a support instance: remember the line number once
                    lineNumbers = fileSupport.setdefault(
                        ptnId, {}).setdefault(rela, [])
                    if lineNo not in lineNumbers:
                        lineNumbers.append(lineNo)

        # replace the recorded line numbers by the text of those lines
        for ptnId in fileSupport:
            byRela = fileSupport[ptnId]
            for rela in byRela:
                byRela[rela] = [article[lineN] for lineN in byRela[rela]]

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return supportInstanceByFile
Exemple #6
0
def generate(inputSPIpath, inputTestPath, outputVSMpath, confidence):
    """Build the per-relation text collections used as the VSM model.

    The keys returned by ``mapper`` for the files of ``inputTestPath`` are
    treated as testing data and excluded; every remaining support line of
    every valid pattern in ``inputSPIpath`` is appended to the collection of
    its relation, and each collection is written to
    ``<outputVSMpath>/<relation>.txt``.

    Args:
        inputSPIpath: directory of "<patternId>.json" files, each read as
            ``{relation: {key: [line, ...]}}``.
        inputTestPath: directory of the testing-data JSON files.
        outputVSMpath: output directory (checked with ``projizz.checkPath``).
        confidence: forwarded to ``projizz.isPatternValidate``.

    NOTE(review): ``mapper`` is called as ``mapper(jobid, filename, path)``
    and is expected to return an iterable of hashable keys - confirm against
    the mapper defined in the same module.
    """
    # Checking output path
    projizz.checkPath(outputVSMpath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool (never more than 20 workers)
    proceessorNumber = min(multiprocessing.cpu_count(), 20)
    pool = multiprocessing.Pool(processes=proceessorNumber)

    # Collect not used keys
    # because using 5-fold CV
    t = 0
    result = []
    for filename in os.listdir(inputTestPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(mapper, (t, filename, inputTestPath)))
            t += 1
    pool.close()
    pool.join()

    # A set, because membership is tested once per (pattern, relation, key)
    # below; a list made every one of those tests a linear scan.
    notUsedKeys = set()
    for r in result:
        notUsedKeys.update(r.get())

    ### Build Model
    # Pattern Selection
    modelArticles = projizz.buildYagoProperties([])
    count = 0
    for filename in os.listdir(inputSPIpath):
        # The pattern id is the name without its ".json" suffix, so the name
        # has to END with it (a substring test would also accept e.g.
        # "12.json.bak" and derive a wrong id from it).
        if not filename.endswith(".json"):
            continue
        ptnId = filename[:-5]

        # ignore invalidate pattern
        if not projizz.isPatternValidate(
                ptnId, table, confidence=confidence):
            continue

        count += 1
        print("%s %s" % (count, ptnId))

        ptnInstance = projizz.jsonRead(os.path.join(inputSPIpath, filename))
        for rela in ptnInstance:
            for key in ptnInstance[rela]:
                # ignore in testing data's key
                if key in notUsedKeys:
                    continue

                for line in ptnInstance[rela][key]:
                    modelArticles[rela].append(line)

        if count % 100 == 0:
            print("Read %s files" % (count))

    for relation in modelArticles:
        print(relation)
        projizz.jsonWrite(modelArticles[relation],
                          os.path.join(outputVSMpath, "%s.txt" % (relation)))
Exemple #7
0
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st,
                   domainRange, inputPath, confidence, nbcPath):
    """Evaluate classifier-filtered relation extraction for one article file.

    For every answer document found in MongoDB for the articles of
    ``filename`` and for every experiment setting, the relations raised by
    the patterns of the article are collected (each candidate must also be
    labelled "pos" by the relation's classifier) and compared with the
    observed relations of the answer.

    Experiment keys have the form ``"<degree>-<ambiguity>-<type>"``:
      * degree 1..5: patterns with more relations than this are skipped;
      * ambiguity: "1" (degree 1 only), "one", "50", "75" or "all";
      * type: "t" requires the relation's domain to be in the entity types,
        "n" ignores type information.

    Args:
        jobid: worker number, used only in log output.
        filename: name of the file to read in both input directories.
        inputPtnPath: directory of the pattern-occurrence JSON files.
        model: not used by this function.
        table: pattern table; ``table[ptnId]["relations"]`` is read.
        partAns: template result structure, deep-copied per experiment key.
        st: per-pattern statistics; ``st[ptnId]`` is a sequence of
            ``(relation, {"support": ..., "total": ...})`` entries.
            # presumably ordered by descending support - confirm.
        domainRange: ``domainRange[relation]["domain"]`` is read.
        inputPath: directory of the article JSON files.
        confidence: forwarded to ``projizz.isPatternValidate``.
        nbcPath: location handed to ``projizz.getNBClassifiers``.

    Returns:
        dict mapping experiment key -> copy of ``partAns`` in which the
        "tp"/"fn"/"fp" lists (and "et1"/"et2"/"et3" for false positives)
        of every attribute were extended with the matching revids.
    """

    def recordPtnMakeRela(ptnId, rela, record):
        # Remember which patterns raised `rela`; used for the error analysis.
        ptns = record.setdefault(rela, [])
        if ptnId not in ptns:
            ptns.append(ptnId)

    def typeMatches(rela, types, useType):
        # Without type information every relation passes.
        return (not useType) or domainRange[rela]["domain"] in types

    def tryExtract(ptnId, rela, lineText, relaEx, ptnExRela):
        # Add `rela` when it is new and its classifier labels the line "pos".
        # Returns True only when the relation was added.
        if rela in relaEx:
            return False
        classifier = classifiers[rela]
        if classifier is None or classifier.classify(lineText) != "pos":
            return False
        relaEx.append(rela)
        recordPtnMakeRela(ptnId, rela, ptnExRela)
        return True

    # read patterns in articles (close the file handle deterministically)
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print("Worker %d : Read %s into filter" % (jobid, filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # keys look like "<revid>.txt"; strip the 4-character extension
    queries = [x[:-4] for x in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1, 6):
        # degree 1 has no ambiguity to resolve, hence the single "1" setting
        ambiguities = ["1"] if deg == 1 else ["one", "50", "75", "all"]
        for typ in ["n", "t"]:
            for amb in ambiguities:
                keyname = "%d-%s-%s" % (deg, amb, typ)
                expResult[keyname] = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        # original properties; in theory a superset of the observed ones
        originRela = ans["properties"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:

            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            useType = args[2] == "t"

            # Relation extraction
            relaEx = []
            ptnExRela = {}  # rela: ptns

            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns

                lineText = article[line[0]]
                # It's a wikipedia reference comment, ignore it!
                # (startswith also copes with an empty line)
                if lineText.startswith("^"):
                    continue

                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line

                    ptnId = "%d" % (ptn[0])

                    # validate the pattern
                    if not projizz.isPatternValidate(
                            ptnId, table, confidence=confidence, st=st):
                        continue

                    # get all possible relations of this pattern
                    rfp = table[ptnId]["relations"]

                    # check degree
                    if len(rfp) > degree:
                        continue

                    #
                    #   Decide which relation(s) to choose
                    #
                    if len(rfp) == 1:  # or degree == 1
                        rela = rfp[0]
                        if (st[ptnId][0][1]["support"] > 0
                                and rela not in relaEx
                                and typeMatches(rela, types, useType)):
                            tryExtract(ptnId, rela, lineText, relaEx,
                                       ptnExRela)

                    elif ambigu == "one":
                        if useType:
                            # take the first supported, type-compatible
                            # relation that can actually be added
                            for ptnst in st[ptnId]:
                                # ptnst[0] = relation
                                # ptnst[1] = {"support": , "total": }
                                if (ptnst[1]["support"] > 0
                                        and domainRange[ptnst[0]]["domain"]
                                        in types):
                                    if tryExtract(ptnId, ptnst[0], lineText,
                                                  relaEx, ptnExRela):
                                        break
                        elif st[ptnId][0][1]["support"] > 0:
                            tryExtract(ptnId, rfp[0], lineText, relaEx,
                                       ptnExRela)

                    elif ambigu == "all":
                        for ptnst in st[ptnId]:
                            if typeMatches(ptnst[0], types, useType):
                                tryExtract(ptnId, ptnst[0], lineText, relaEx,
                                           ptnExRela)

                    else:
                        # "50" / "75": keep relations whose support reaches
                        # that share of the first entry's support
                        th = 0.5 if ambigu == "50" else 0.75

                        b = st[ptnId][0][1]["support"]
                        if b > 0:
                            for ptnst in st[ptnId]:
                                ratio = float(ptnst[1]["support"]) / float(b)
                                if ratio >= th and typeMatches(
                                        ptnst[0], types, useType):
                                    tryExtract(ptnId, ptnst[0], lineText,
                                               relaEx, ptnExRela)

            # Evaluation
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                counters = expResult[keyname][attribute]
                positive = attribute in relaEx

                if attribute in relation:
                    counters["tp" if positive else "fn"].append(revid)
                    continue

                if not positive:
                    # ignore true-negative
                    continue

                # False Positive
                counters["fp"].append(revid)

                # Analyse the cause of the error
                if attribute not in ptnExRela:
                    # every extracted relation is recorded together with its
                    # patterns, so this case is not expected to happen
                    continue

                if attribute in originRela:
                    # type 2 error: listed in "properties" but not observed
                    counters["et2"].append(revid)
                    continue

                # Could one of the patterns that raised this relation also
                # raise a type-compatible relation that is in the answer
                # (whether it was chosen or not)?
                found = any(
                    psbR != attribute
                    and domainRange[psbR]["domain"] in types
                    and psbR in relation
                    for pid in ptnExRela[attribute]
                    for psbR in table[pid]["relations"])

                # type 1 error when such a relation exists, type 3 otherwise
                counters["et1" if found else "et3"].append(revid)

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
Exemple #8
0
def filterFunction(jobid,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence):
    """Evaluate pattern-based relation extraction for one article file.

    For every answer document found in MongoDB for the articles of
    ``filename`` and for every experiment setting, the relations raised by
    the patterns of the article are collected and compared with the
    observed relations of the answer.

    Experiment keys have the form ``"<degree>-<ambiguity>-<type>"``:
      * degree 1..5: patterns with more relations than this are skipped;
      * ambiguity: "1" (degree 1 only), "one", "50", "75" or "all";
      * type: "t" requires the relation's domain to be in the entity types,
        "n" ignores type information.

    Args:
        jobid: worker number, used only in log output.
        filename: name of the file to read in both input directories.
        inputPtnPath: directory of the pattern-occurrence JSON files.
        model: not used by this function.
        table: pattern table; ``table[ptnId]["relations"]`` is read.
        partAns: template result structure, deep-copied per experiment key.
        st: per-pattern statistics; ``st[ptnId]`` is a sequence of
            ``(relation, {"support": ..., "total": ...})`` entries.
            # presumably ordered by descending support - confirm.
        domainRange: ``domainRange[relation]["domain"]`` is read.
        inputPath: directory of the article JSON files.
        confidence: forwarded to ``projizz.isPatternValidate``.

    Returns:
        dict mapping experiment key -> copy of ``partAns`` whose
        "tp"/"fn"/"fp" lists were extended with the matching revids.
    """

    def typeMatches(rela, types, useType):
        # Without type information every relation passes.
        return (not useType) or domainRange[rela]["domain"] in types

    def addOnce(rela, relaEx):
        # Append `rela` unless already extracted; True when it was added.
        if rela in relaEx:
            return False
        relaEx.append(rela)
        return True

    # read patterns in articles (close the file handle deterministically)
    with open(os.path.join(inputPtnPath,filename),"r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    contentJson = projizz.jsonRead(os.path.join(inputPath,filename))

    print("Worker %d : Read %s into filter" % (jobid,filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # keys look like "<revid>.txt"; strip the 4-character extension
    queries = [x[:-4] for x in contentPtnJson]
    itr = collection.find({"revid":{"$in":queries}})
    print("worker %d query=%d, result=%d" % (jobid,len(queries),itr.count()))

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1,6):
        # degree 1 has no ambiguity to resolve, hence the single "1" setting
        ambiguities = ["1"] if deg == 1 else ["one","50","75","all"]
        for typ in ["n","t"]:
            for amb in ambiguities:
                keyname = "%d-%s-%s" % (deg,amb,typ)
                expResult[keyname] = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:

            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            useType = args[2] == "t"

            # Relation extraction
            relaEx = []
            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns

                lineText = article[line[0]]
                # It's a wikipedia reference comment, ignore it!
                # (startswith also copes with an empty line)
                if lineText.startswith("^"):
                    continue

                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line

                    ptnId = "%d" % (ptn[0])

                    if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                        continue

                    rfp = table[ptnId]["relations"]

                    # check degree
                    if len(rfp) > degree:
                        continue

                    if len(rfp) == 1:   # or degree == 1
                        rela = rfp[0]
                        if (st[ptnId][0][1]["support"] > 0
                                and rela not in relaEx
                                and typeMatches(rela, types, useType)):
                            relaEx.append(rela)

                    elif ambigu == "one":
                        if useType:
                            # take the first supported, type-compatible
                            # relation that is not extracted yet
                            for ptnst in st[ptnId]:
                                # ptnst[0] = relation
                                # ptnst[1] = {"support": , "total": }
                                if (ptnst[1]["support"] > 0
                                        and domainRange[ptnst[0]]["domain"] in types):
                                    if addOnce(ptnst[0], relaEx):
                                        break
                        elif st[ptnId][0][1]["support"] > 0:
                            addOnce(rfp[0], relaEx)

                    elif ambigu == "all":
                        for ptnst in st[ptnId]:
                            if typeMatches(ptnst[0], types, useType):
                                addOnce(ptnst[0], relaEx)

                    else:
                        # "50" / "75": keep relations whose support reaches
                        # that share of the first entry's support
                        th = 0.5 if ambigu == "50" else 0.75

                        b = st[ptnId][0][1]["support"]
                        if b > 0:
                            for ptnst in st[ptnId]:
                                ratio = float(ptnst[1]["support"])/float(b)
                                if ratio >= th and typeMatches(ptnst[0], types, useType):
                                    addOnce(ptnst[0], relaEx)

            # Evaluation
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                counters = expResult[keyname][attribute]
                positive = attribute in relaEx

                if attribute in relation:
                    counters["tp" if positive else "fn"].append(revid)
                elif positive:
                    counters["fp"].append(revid)
                # true negatives are not recorded

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid,count))

    return expResult
def generate(inputSPIpath,inputTestPath,outputVSMpath,confidence):
    """Collect the text lines of each relation and write one file per relation.

    Steps, as implemented below:
      1. Run ``mapper`` on every file of ``inputTestPath`` whose name
         contains ".json" (in a process pool) and merge the keys it returns.
         Those keys belong to the held-out fold of the 5-fold cross
         validation and are excluded from the model.
      2. For every valid pattern file ``<ptnId>.json`` in ``inputSPIpath``,
         append the lines of all remaining keys to the bucket of the
         relation they are stored under.
      3. Write every bucket to ``<outputVSMpath>/<relation>.txt``.

    Args:
        inputSPIpath: directory with one ``<ptnId>.json`` file per pattern;
            each file maps relation -> key -> list of lines.
        inputTestPath: directory with the test-fold files handed to ``mapper``.
        outputVSMpath: output directory (checked with ``projizz.checkPath``).
        confidence: confidence value forwarded to ``projizz.isPatternValidate``.

    Returns:
        None. Results are written to ``outputVSMpath``.
    """
    # Checking output path
    projizz.checkPath(outputVSMpath)

    # Only the table is needed here; the prefix-tree model is not used.
    _model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool, capped at 20 workers
    processorNumber = min(multiprocessing.cpu_count(), 20)
    pool = multiprocessing.Pool(processes=processorNumber)

    # Collect not used keys
    # because using 5-fold CV
    t = 0
    result = []
    for filename in os.listdir(inputTestPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputTestPath) )  )
            t += 1
    pool.close()
    pool.join()

    # A set keeps the membership test in the nested loop below O(1).
    # NOTE(review): assumes mapper returns an iterable of hashable keys
    # (they are compared against JSON object keys below) - confirm.
    notUsedKeys = set()
    for r in result:
        notUsedKeys.update(r.get())

    ### Build Model
    # Pattern Selection
    modelArticles = projizz.buildYagoProperties([])
    count = 0
    for filename in os.listdir(inputSPIpath):
        # The pattern id is the file name without its ".json" suffix, so the
        # suffix has to be at the very end for the slice below to be right.
        if not filename.endswith(".json"):
            continue
        ptnId = filename[:-5]

        # ignore invalid pattern
        if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
            continue

        count += 1
        print("%s %s" % (count,ptnId))

        ptnInstance = projizz.jsonRead( os.path.join(inputSPIpath,filename) )
        for rela in ptnInstance:
            for key in ptnInstance[rela]:
                # ignore keys that belong to the testing data
                if key in notUsedKeys:
                    continue

                for line in ptnInstance[rela][key]:
                    modelArticles[rela].append(line)

        if count%100 == 0:
            print("Read %s files" % (count))

    for relation in modelArticles:
        print(relation)
        projizz.jsonWrite(modelArticles[relation],os.path.join(outputVSMpath,"%s.txt" % (relation)))
Exemple #10
0
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData):
    """Evaluate VSM-similarity based relation extraction for one article file.

    For every answer document found in MongoDB for the articles of
    ``filename``, each valid pattern occurrence is scored against its
    candidate relations with ``projizz.vsmSimilarity``. A relation counts as
    extracted for a threshold when its domain is among the entity types and
    its score exceeds the threshold. Thresholds are 0.00, 0.05, ..., 0.50,
    stored under the integer keys 0, 5, ..., 50.

    The extracted relations are collected per article: the original code
    kept one shared list per threshold for the whole file, so relations
    found in earlier articles were counted as positives for later ones.

    Args:
        jobid: worker number, used only in log output.
        filename: name of the file to read in both input directories.
        inputPath: directory of the article JSON files.
        inputPtnPath: directory of the pattern-occurrence JSON files.
        table: pattern table; "pattern" and "relations" of each id are read.
        st: per-pattern statistics; ``st[ptnId][0][1]["support"]`` is read.
        partAns: template result structure, deep-copied per threshold.
        domainRange: ``domainRange[relation]["domain"]`` is read.
        confidence: forwarded to ``projizz.isPatternValidate``.
        vsmData: model data handed to ``projizz.vsmSimilarity``.

    Returns:
        dict mapping threshold key -> copy of ``partAns`` whose
        "tp"/"fn"/"fp" lists were extended with the matching revids.
    """
    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath,filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath,filename))

    print("Worker %d : Read %s" % (jobid,filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # keys look like "<revid>.txt"; strip the 4-character extension
    queries = [x[:-4] for x in contentPtnJson]
    itr = collection.find({"revid":{"$in":queries}})
    print("worker %d query=%d, result=%d" % (jobid,len(queries),itr.count()))

    count = 0

    # set thresholds (in percent)
    expResult = {}
    for th in range(0,51,5):
        expResult[th] = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Relations extracted from THIS article, per threshold. Must start
        # empty for every article so the evaluation below is per article.
        relaEx = {}
        for keyname in expResult:
            relaEx[keyname] = []

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line

                ptnId = "%d" % (ptn[0])

                ptntks = table[ptnId]["pattern"]
                lineText = article[line[0]]

                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue

                rfp = table[ptnId]["relations"]

                # check degree
                if len(rfp) > 5:
                    continue

                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                # TODO - Modify string, remove pattern text in string?
                cosRlt = projizz.vsmSimilarity( lineText, vsmData, relas=rfp, ptntext=ptntks )

                # NOTE - if cosine value > threshold then there is a relation (?)
                for keyname in expResult:
                    threshold = float(keyname)/100.0
                    extracted = relaEx[keyname]

                    for pr in cosRlt:
                        # Check type, then score, then avoid duplicates
                        if (domainRange[pr]["domain"] in types
                                and cosRlt[pr] > threshold
                                and pr not in extracted):
                            extracted.append(pr)

        #### Evaluation
        for keyname in expResult:
            extracted = relaEx[keyname]
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                counters = expResult[keyname][attribute]
                positive = attribute in extracted

                if attribute in relation:
                    counters["tp" if positive else "fn"].append(revid)
                elif positive:
                    counters["fp"].append(revid)
                # true negatives are not recorded

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid,count))

    return expResult
Exemple #11
0
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st,
                   domainRange, inputPath, confidence):
    """Evaluate pattern-based relation extraction for one article file.

    For every answer document found in MongoDB for the articles of
    ``filename`` and for every experiment setting, the relations raised by
    the patterns of the article are collected and compared with the
    observed relations of the answer.

    Experiment keys have the form ``"<degree>-<ambiguity>-<type>"``:
      * degree 1..5: patterns with more relations than this are skipped;
      * ambiguity: "1" (degree 1 only), "one", "50", "75" or "all";
      * type: "t" requires the relation's domain to be in the entity types,
        "n" ignores type information.

    Args:
        jobid: worker number, used only in log output.
        filename: name of the file to read in both input directories.
        inputPtnPath: directory of the pattern-occurrence JSON files.
        model: not used by this function.
        table: pattern table; ``table[ptnId]["relations"]`` is read.
        partAns: template result structure, deep-copied per experiment key.
        st: per-pattern statistics; ``st[ptnId]`` is a sequence of
            ``(relation, {"support": ..., "total": ...})`` entries.
            # presumably ordered by descending support - confirm.
        domainRange: ``domainRange[relation]["domain"]`` is read.
        inputPath: directory of the article JSON files.
        confidence: forwarded to ``projizz.isPatternValidate``.

    Returns:
        dict mapping experiment key -> copy of ``partAns`` whose
        "tp"/"fn"/"fp" lists were extended with the matching revids.
    """

    def typeMatches(rela, types, useType):
        # Without type information every relation passes.
        return (not useType) or domainRange[rela]["domain"] in types

    def addOnce(rela, relaEx):
        # Append `rela` unless already extracted; True when it was added.
        if rela in relaEx:
            return False
        relaEx.append(rela)
        return True

    # read patterns in articles (close the file handle deterministically)
    with open(os.path.join(inputPtnPath, filename), "r") as ptnFile:
        contentPtnJson = json.load(ptnFile)
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))

    print("Worker %d : Read %s into filter" % (jobid, filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # keys look like "<revid>.txt"; strip the 4-character extension
    queries = [x[:-4] for x in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1, 6):
        # degree 1 has no ambiguity to resolve, hence the single "1" setting
        ambiguities = ["1"] if deg == 1 else ["one", "50", "75", "all"]
        for typ in ["n", "t"]:
            for amb in ambiguities:
                keyname = "%d-%s-%s" % (deg, amb, typ)
                expResult[keyname] = copy.deepcopy(partAns)

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        revid = ans["revid"]
        key = "%s.txt" % (revid)
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:

            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            useType = args[2] == "t"

            # Relation extraction
            relaEx = []
            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns

                lineText = article[line[0]]
                # It's a wikipedia reference comment, ignore it!
                # (startswith also copes with an empty line)
                if lineText.startswith("^"):
                    continue

                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line

                    ptnId = "%d" % (ptn[0])

                    if not projizz.isPatternValidate(
                            ptnId, table, confidence=confidence, st=st):
                        continue

                    rfp = table[ptnId]["relations"]

                    # check degree
                    if len(rfp) > degree:
                        continue

                    if len(rfp) == 1:  # or degree == 1
                        rela = rfp[0]
                        if (st[ptnId][0][1]["support"] > 0
                                and rela not in relaEx
                                and typeMatches(rela, types, useType)):
                            relaEx.append(rela)

                    elif ambigu == "one":
                        if useType:
                            # take the first supported, type-compatible
                            # relation that is not extracted yet
                            for ptnst in st[ptnId]:
                                # ptnst[0] = relation
                                # ptnst[1] = {"support": , "total": }
                                if (ptnst[1]["support"] > 0
                                        and domainRange[ptnst[0]]["domain"]
                                        in types):
                                    if addOnce(ptnst[0], relaEx):
                                        break
                        elif st[ptnId][0][1]["support"] > 0:
                            addOnce(rfp[0], relaEx)

                    elif ambigu == "all":
                        for ptnst in st[ptnId]:
                            if typeMatches(ptnst[0], types, useType):
                                addOnce(ptnst[0], relaEx)

                    else:
                        # "50" / "75": keep relations whose support reaches
                        # that share of the first entry's support
                        th = 0.5 if ambigu == "50" else 0.75

                        b = st[ptnId][0][1]["support"]
                        if b > 0:
                            for ptnst in st[ptnId]:
                                ratio = float(ptnst[1]["support"]) / float(b)
                                if ratio >= th and typeMatches(
                                        ptnst[0], types, useType):
                                    addOnce(ptnst[0], relaEx)

            # Evaluation
            for attribute in expResult[keyname]:

                # special case, ignore.
                if attribute == "produced":
                    continue

                counters = expResult[keyname][attribute]
                positive = attribute in relaEx

                if attribute in relation:
                    counters["tp" if positive else "fn"].append(revid)
                elif positive:
                    counters["fp"].append(revid)
                # true negatives are not recorded

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult
def _recordPatternHit(linesByRela, lineNosByRela, rela, lineNo, ptnId, isReference):
    """Record that pattern ``ptnId`` fired for relation ``rela`` on ``lineNo``.

    ``linesByRela`` maps relation -> line number -> list of pattern ids and
    ``lineNosByRela`` maps relation -> list of line numbers.  Both are updated
    in place and never receive duplicates.  The relation key is created in
    ``lineNosByRela`` even when ``isReference`` is true; only the line number
    itself is left out in that case.
    """
    ptnIds = linesByRela.setdefault(rela, {}).setdefault(lineNo, [])
    if ptnId not in ptnIds:
        ptnIds.append(ptnId)

    lineNos = lineNosByRela.setdefault(rela, [])
    if not isReference and lineNo not in lineNos:
        lineNos.append(lineNo)


def _appendStrippedLines(article, table, linesByRela, collected):
    """Append every recorded line, minus its pattern tokens, to ``collected``.

    ``collected`` maps relation -> list of strings and is updated in place.
    """
    for rela in linesByRela:
        bucket = collected.setdefault(rela, [])
        for lineNo in linesByRela[rela]:
            tokens = projizz.getTokens(article[lineNo].lower())
            for ptnId in linesByRela[rela][lineNo]:
                for ptnToken in table[ptnId]["pattern"].split():
                    # remove() is called once per pattern token, so at most one
                    # occurrence is dropped per pattern
                    # (assumes getTokens returns a list -- TODO confirm).
                    if ptnToken in tokens:
                        tokens.remove(ptnToken)
            bucket.append(' '.join(tokens))


def _appendLabelledLines(article, lineNosByRela, collected, label):
    """Append ``{"text": ..., "label": label}`` samples to ``collected`` in place."""
    for rela in lineNosByRela:
        bucket = collected.setdefault(rela, [])
        for lineNo in lineNosByRela[rela]:
            bucket.append({"text": article[lineNo], "label": label})


def mapper(jobid,filename,inputPath,inputPtnPath,model,table,confidence):
    """Collect per-relation training lines from one article/pattern file pair.

    For every answer document whose ``revid`` matches a key of the pattern
    file, each pattern hit is examined.  Patterns rejected by
    ``projizz.isPatternValidate`` or linked to more than five relations are
    skipped.  For each relation of a remaining pattern, the line counts as a
    positive example when the relation is listed in the answer's
    ``"observed"`` field and as a negative example otherwise.

    Parameters:
        jobid: worker number, used only in progress messages.
        filename: file name looked up under both input directories.
        inputPath: directory holding the article JSON file.
        inputPtnPath: directory holding the pattern-hit JSON file.
        model: not used by this function.
        table: pattern table indexed by pattern id string; the
            ``"relations"`` and ``"pattern"`` fields are read.
        confidence: forwarded to ``projizz.isPatternValidate``.

    Returns:
        A 4-tuple ``(linesByRelations, linesNoRelaByRelations, POS, NEG)``.
        The first two map relation -> list of lower-cased lines with the
        pattern tokens removed.  The last two map relation -> list of
        ``{"text": line, "label": "pos"|"neg"}`` dicts; lines starting with
        "^" are left out of these two.
    """
    # Read article
    contentJson = projizz.jsonRead( os.path.join(inputPath,filename) )
    # Read ptn
    contentPtnJson = projizz.jsonRead( os.path.join(inputPtnPath,filename) )

    print("Worker %d : Read %s into filter" % (jobid,filename))

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Keys are rebuilt below as "<revid>.txt", so dropping the last four
    # characters yields the revid to query for.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid":{"$in":queries}})
    print("worker %d query=%d, result=%d" % (jobid,len(queries),itr.count()))

    count = 0

    linesByRelations = {}
    linesNoRelaByRelations = {}

    POS = {}
    NEG = {}

    for ans in itr:
        count += 1

        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Per-article bookkeeping, merged into the results after the scan.
        linesByRela = {}
        linesByNoRela = {}
        pos = {}
        neg = {}

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineNo = line[0]
            lineText = article[lineNo]
            # startswith() also copes with an empty sentence, where indexing
            # the first character would raise IndexError.
            isReference = lineText.startswith("^")

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                    continue

                # give up patterns whose degree is > 5
                relations = table[ptnId]["relations"]
                if len(relations) > 5:
                    continue

                for rela in relations:
                    if rela in relation:
                        # it's a support instance
                        _recordPatternHit(linesByRela, pos, rela, lineNo, ptnId, isReference)
                    else:
                        _recordPatternHit(linesByNoRela, neg, rela, lineNo, ptnId, isReference)

        _appendStrippedLines(article, table, linesByRela, linesByRelations)
        _appendStrippedLines(article, table, linesByNoRela, linesNoRelaByRelations)

        # For binary classifier
        _appendLabelledLines(article, pos, POS, "pos")
        _appendLabelledLines(article, neg, NEG, "neg")

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid,count))

    return linesByRelations,linesNoRelaByRelations,POS,NEG
def mapper(jobid,filename,inputPath,inputPtnPath,model,table):
    """Gather the supporting sentences of every pattern for one input file.

    Parameters:
        jobid: worker number, used only in progress messages.
        filename: file name looked up under both input directories.
        inputPath: directory holding the article JSON file.
        inputPtnPath: directory holding the pattern-hit JSON file.
        model: not used by this function.
        table: pattern table indexed by pattern id string; the
            ``"relations"`` field is read here.

    Returns:
        A dict mapping ``"<revid>.txt"`` -> pattern id -> relation -> list of
        article lines on which the pattern fired and whose relation is listed
        in the answer's ``"observed"`` field.
    """
    articlePath = os.path.join(inputPath, filename)
    patternPath = os.path.join(inputPtnPath, filename)
    contentJson = projizz.jsonRead(articlePath)
    contentPtnJson = projizz.jsonRead(patternPath)

    print("Worker %d : Read %s into filter" % (jobid, filename))

    # Fetch the answer documents whose revid matches a key of the pattern
    # file; keys are rebuilt below as "<revid>.txt", hence the [:-4].
    answers = pymongo.Connection().projizz.result.yago.answer
    revids = [name[:-4] for name in contentPtnJson]
    cursor = answers.find({"revid": {"$in": revids}})
    print("worker %d query=%d, result=%d" % (jobid, len(revids), cursor.count()))

    supportInstanceByFile = {}

    for done, ans in enumerate(cursor, 1):
        key = "%s.txt" % (ans["revid"])
        observed = ans["observed"]

        patternHits = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        byPattern = {}
        supportInstanceByFile[key] = byPattern

        # First pass: remember the line numbers of every support instance.
        for line in patternHits:
            # line[0] is the line number, line[1] the patterns found on it
            lineNo = line[0]
            # Not used further down; the lookup is kept so that a line number
            # missing from the article still fails at this point.
            lineText = article[lineNo]

            for ptn in line[1]:
                # ptn[0] is the pattern id, ptn[1]/ptn[2] its start/end offsets
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table):
                    continue

                for rela in table[ptnId]["relations"]:
                    if rela not in observed:
                        continue
                    # a support instance: the pattern's relation is observed
                    lineNos = byPattern.setdefault(ptnId, {}).setdefault(rela, [])
                    if lineNo not in lineNos:
                        lineNos.append(lineNo)

        # Second pass: swap the stored line numbers for the line text itself.
        for ptnId in byPattern:
            byRela = byPattern[ptnId]
            for rela in byRela:
                byRela[rela] = [article[n] for n in byRela[rela]]

        if done % 100 == 0:
            print("worker #%d done %d." % (jobid, done))

    return supportInstanceByFile
Exemple #14
0
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, nbcPath):
    """Extract relations with per-relation classifiers and score them.

    For every answer document whose ``revid`` matches a key of the pattern
    file, each non-reference line with a usable pattern is passed to the
    classifier of every relation listed for that pattern in ``st``.  The
    relations classified as ``"pos"`` are then compared with the answer's
    ``"observed"`` field, and the article's revid is appended to the
    ``"tp"``, ``"fn"`` or ``"fp"`` list of each attribute.

    Parameters:
        jobid: worker number, used only in progress messages.
        filename: file name looked up under both input directories.
        inputPath: directory holding the article JSON file.
        inputPtnPath: directory holding the pattern-hit JSON file.
        table: pattern table indexed by pattern id string; the
            ``"relations"`` field is read here.
        st: per-pattern statistics; ``st[ptnId]`` is a sequence of
            ``(relation, {"support": ..., "total": ...})`` entries.
        partAns: result skeleton mapping attribute -> dict with ``"tp"``,
            ``"fn"`` and ``"fp"`` lists.  It is updated IN PLACE and returned.
        domainRange: per-relation domain information (see the note at the
            domain check below).
        confidence: forwarded to ``projizz.isPatternValidate``.
        nbcPath: forwarded to ``projizz.getNBClassifiers``.

    Returns:
        ``partAns`` itself, with the revids appended.
    """
    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))

    classifiers = projizz.getNBClassifiers(nbcPath)
    print("Worker %d : Read %s" % (jobid, filename))

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # Keys are rebuilt below as "<revid>.txt", so dropping the last four
    # characters yields the revid to query for.
    queries = [name[:-4] for name in contentPtnJson]
    itr = collection.find({"revid": {"$in": queries}})
    print("worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()))

    count = 0
    # NOTE: this is an alias, not a copy; the caller's partAns is mutated.
    expResult = partAns

    print("worker %d build expResult" % (jobid))

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        types = ans["type"]

        # Now only consider properties, no references.
        relation = ans["observed"]

        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # Relations extracted from THIS article only.  The list has to start
        # empty for every article: the evaluation below is per article, and
        # carrying relations over from earlier articles would count them as
        # positives for every later one.
        relaEx = []

        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns

            lineText = article[line[0]]
            if lineText[0] == "^":  # It's a wikipedia reference comment, ignore it!
                continue

            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line

                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue

                # check degree
                if len(table[ptnId]["relations"]) > 5:
                    continue

                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue

                for ptnst in st[ptnId]:
                    # ptnst[0] = relation
                    # ptnst[1] = {"support":,"total": }
                    rela = ptnst[0]

                    # When the entry is a mapping, compare its "domain" field
                    # with the entity types; testing the mapping itself for
                    # membership would compare a dict against each type.
                    # (presumably types holds type names -- TODO confirm)
                    domain = domainRange[rela]
                    if isinstance(domain, dict):
                        domain = domain["domain"]
                    if domain not in types:
                        continue

                    classifier = classifiers[rela]
                    if classifier is None:
                        continue

                    # Already extracted for this article: classifying the
                    # line again could not change the outcome.
                    if rela in relaEx:
                        continue

                    if classifier.classify(lineText) == "pos":
                        relaEx.append(rela)

        #### Evaluation
        for attribute in expResult:

            # special case, ignore.
            if attribute == "produced":
                continue

            postive = attribute in relaEx
            true = attribute in relation

            if true:
                if postive:
                    expResult[attribute]["tp"].append(ans["revid"])
                else:
                    expResult[attribute]["fn"].append(ans["revid"])
            elif postive:
                expResult[attribute]["fp"].append(ans["revid"])
            # true negatives are not recorded

        if count % 100 == 0:
            print("worker #%d done %d." % (jobid, count))

    return expResult