Ejemplo n.º 1
0
def calculateOverlay():
    """Print how many usable patterns each pair of YAGO relations shares.

    A pattern is counted when its table entry has a truthy "used" flag and
    does not carry a falsy "eval" flag.  For every ordered pair
    (relaA, relaB) the count is the number of such patterns whose
    "relations" list contains both; ``overlay[r][r]`` is therefore the number
    of counted patterns that list ``r``.  One line is printed per pair, with
    the pairs of each relaA ordered by descending shared count.

    A ratio whose denominator is 0 (a relation without any counted pattern)
    is printed as 0.00000 instead of raising ZeroDivisionError.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    # Fetch the relation list once; it is needed by every loop below.
    relations = projizz.getYagoRelation()

    overlay = {}
    for relation in relations:
        overlay[relation] = {}
        for rela in relations:
            overlay[relation][rela] = 0

    # Build table
    for pid in table:
        entry = table[pid]
        if not entry["used"]:
            continue
        if "eval" in entry and not entry["eval"]:
            continue
        for relation in entry["relations"]:
            for rela in entry["relations"]:
                overlay[relation][rela] += 1

    for relaA in relations:
        ranked = sorted(((relaB, overlay[relaA][relaB]) for relaB in relations),
                        key=lambda x: x[1], reverse=True)
        a = overlay[relaA][relaA]
        for relaB, overC in ranked:
            b = overlay[relaB][relaB]
            # Guard both denominators: a relation may have no counted pattern.
            ratioA = float(overC) / a if a else 0.0
            ratioB = float(overC) / b if b else 0.0
            print("%s(%d) -> %s(%d) %d/%d %.5f (%d/%d,%.5f)" % (relaA,a,relaB,b,overC,a,ratioA,overC,b,ratioB))
Ejemplo n.º 2
0
def main(inputPtnPath,outputPath,pspath,inputPath):
    """Run ``filterFunction`` on every pattern file and merge the answers.

    Every entry of ``inputPtnPath`` whose name contains ".json" is submitted
    to a process pool.  The "tp"/"tn"/"fp"/"fn" lists returned per key by the
    workers are concatenated into one structure, which is dumped as JSON.

    Args:
        inputPtnPath: directory listed for pattern files.
        outputPath: path of the JSON file to write.
        pspath: JSON file read with ``projizz.jsonRead`` and passed through
            ``projizz.getSortedPatternStatistic``.
        inputPath: forwarded unchanged to ``filterFunction``.
    """
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp":[],"tn":[],"fp":[],"fn":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        # Each job receives its own empty copy of the answer skeleton.
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath )))
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            for metric in ("tp", "tn", "fp", "fn"):
                properties[m][metric] += r[m][metric]

    print("start write out to %s" % (outputPath))
    # Close the output file deterministically instead of leaking the handle.
    with open(outputPath, "w") as fout:
        json.dump(properties, fout)

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (3s + 5000us must read 3.005000, not 3.5000)
    # and fold whole days into the seconds count.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
def main(inputPtnPath,outputPath,pspath,inputPath):
    """Run ``filterFunction`` on every pattern file using the confidence table.

    Same flow as a plain evaluation run: each ".json" entry of
    ``inputPtnPath`` becomes one pool job, and the per-key
    "tp"/"tn"/"fp"/"fn" lists returned by the jobs are concatenated and
    dumped as JSON to ``outputPath``.

    Args:
        inputPtnPath: directory listed for pattern files.
        outputPath: path of the JSON file to write.
        pspath: JSON file read with ``projizz.jsonRead`` and passed through
            ``projizz.getSortedPatternStatistic``.
        inputPath: forwarded unchanged to ``filterFunction``.
    """
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"tn":[],"fp":[],"fn":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    result = []
    t = 0
    for filename in os.listdir(inputPtnPath):
        if ".json" not in filename:
            continue
        # Workers fill in a private copy of the empty answer structure.
        partAns = copy.deepcopy(properties)
        job = pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath ))
        result.append(job)
        t += 1
    pool.close()
    pool.join()

    metrics = ("tp", "tn", "fp", "fn")
    for res in result:
        r = res.get()
        for m in r:
            for metric in metrics:
                properties[m][metric] += r[m][metric]

    print("start write out to %s" % (outputPath))
    # ``with`` guarantees the JSON file is flushed and closed.
    with open(outputPath, "w") as fout:
        json.dump(properties, fout)

    diff = datetime.now() - start_time
    # %06d keeps the fractional part correct; days are folded into seconds.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
Ejemplo n.º 4
0
def ambiguityDegree():
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    maxDegree = 0
    degree = {}
  
    for pid in table:
        if table[pid]["used"]:
            if "eval" in table[pid] and not table[pid]["eval"]:
                continue

            d = len(table[pid]["relations"])
            if not d in degree:
                degree[d] = []
                if d > maxDegree:
                    maxDegree = d
            
            degree[d].append(pid)


    for d in range(1,maxDegree+1):
        if not d in degree:
            print d,"0"
            continue
        #print d,len(degree[d])

        for pid in degree[d]:
            print "%d\t%s\t%s\t%s\t%.5f\t%d" % (d,pid,table[pid]["pattern"],table[pid]["relations"],table[pid]["confidence"],table[pid]["support"])
def main(inputPtnPath,outputPath):
    """Aggregate per-pattern "total"/"support" counters over all pattern files.

    Each ".json" entry of ``inputPtnPath`` is handed to ``filterFunction`` in
    a process pool.  Every job returns, per relation and pattern id, a dict
    with "total" and "support" numbers; these are summed and the result is
    dumped as JSON to ``outputPath``.

    Args:
        inputPtnPath: directory listed for pattern files.
        outputPath: path of the JSON file to write.
    """
    start_time = datetime.now()

    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")

    properties = projizz.buildYagoProperties({})

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,copy.deepcopy(properties) )))
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for rela in r:
            for ptnId in r[rela]:
                # First sighting of this pattern for the relation starts at 0.
                counters = properties[rela].setdefault(ptnId, {"total":0,"support":0})
                counters["total"] += r[rela][ptnId]["total"]
                counters["support"] += r[rela][ptnId]["support"]

    # Close the output file deterministically instead of leaking the handle.
    with open(outputPath, "w") as fout:
        json.dump(properties, fout)

    diff = datetime.now() - start_time
    # Zero-pad the microseconds and fold whole days into the seconds count.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
Ejemplo n.º 6
0
def main(inputPtnPath,outputPath,pspath,inputPath,confidence,outputFilename,nbcPath):
    """Evaluate every pattern file and write one result file per experiment key.

    Each ".json" entry of ``inputPtnPath`` is processed by ``filterFunction``
    in a pool of at most 8 processes.  A job returns a dict keyed by
    experiment name; for each name the "tp"/"fp"/"fn"/"et1"/"et2"/"et3" lists
    of every entry except "produced" are concatenated.  The merged result of
    each experiment is written to ``outputPath/<key>/<outputFilename>``.

    Args:
        inputPtnPath: directory listed for pattern files.
        outputPath: root output directory (created when missing).
        pspath: JSON file with pattern statistics.
        inputPath: forwarded unchanged to ``filterFunction``.
        confidence: forwarded unchanged to ``filterFunction``.
        outputFilename: file name used inside every experiment directory.
        nbcPath: forwarded unchanged to ``filterFunction``.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago//yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[],"et1":[],"et2":[],"et3":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    # Never start more than 8 worker processes.
    cpuCount = min(multiprocessing.cpu_count(), 8)

    pool = multiprocessing.Pool(processes=cpuCount)
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,nbcPath )))
    pool.close()
    pool.join()

    metrics = ("tp", "fp", "fn", "et1", "et2", "et3")
    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            merged = expResult[keyname]
            for m in r[keyname]:
                if m == "produced":
                    continue
                for metric in metrics:
                    merged[m][metric] += r[keyname][m][metric]

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for keyname in expResult:
        keyDir = os.path.join(outputPath,keyname)
        if not os.path.isdir(keyDir):
            os.mkdir(keyDir)
        projizz.jsonWrite(expResult[keyname],os.path.join(keyDir,outputFilename))
        print("start write out to %s" % (keyDir))

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (3s + 5000us must read 3.005000, not 3.5000)
    # and fold whole days into the seconds count.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
Ejemplo n.º 7
0
def main(inputPtnPath, outputPath, pspath):
    """Evaluate pattern files against the top 200 unambiguous patterns.

    For every relation in the sorted statistic, the first 200 patterns (in
    the statistic's order) whose table entry lists exactly one relation are
    collected.  That list is passed to ``filterFunction`` for each ".json"
    entry of ``inputPtnPath``; the returned "tp"/"tn"/"fp"/"fn" lists are
    concatenated and dumped as JSON to ``outputPath``.

    Args:
        inputPtnPath: directory listed for pattern files.
        outputPath: path of the JSON file to write.
        pspath: JSON file read with ``projizz.jsonRead`` and passed through
            ``projizz.getSortedStatistic``.
    """
    model, table = projizz.readPrefixTreeModelWithTable(
        "./yagoPatternTree.model", "./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({
        "tp": [],
        "tn": [],
        "fp": [],
        "fn": []
    })
    sp = projizz.getSortedStatistic(projizz.jsonRead(pspath))

    # Get Top 200 Relation
    topN = 200
    validate = []
    for relation in sp:
        count = 0
        for ptnId, _stat in sp[relation]:
            if len(table[ptnId]["relations"]) == 1:
                count += 1
                validate.append(ptnId)
            if count >= topN:
                break

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        partAns = copy.deepcopy(properties)
        result.append(
            pool.apply_async(filterFunction,
                             (t, filename, inputPtnPath, model, table,
                              partAns, validate)))
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            for metric in ("tp", "tn", "fp", "fn"):
                properties[m][metric] += r[m][metric]

    print("start write out to %s" % (outputPath))
    # Close the output file deterministically instead of leaking the handle.
    with open(outputPath, "w") as fout:
        json.dump(properties, fout)

    diff = datetime.now() - start_time
    # Zero-pad the microseconds and fold whole days into the seconds count.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds,
                                     diff.microseconds))
Ejemplo n.º 8
0
def main():
    """Print, for every pattern in the table, how many stored patterns match it.

    Each pattern text is turned into a regular expression that matches the
    literal text followed by ";" and anything else, and the
    ``projizz.patty.wiki.pattern`` collection is queried on "patterntext"
    with it.  The expression and the number of hits are printed.
    """
    # Local import: only this entry point needs ``re``.
    import re

    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.patty.wiki.pattern

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")

    for ptnId in table:
        ptnText = table[ptnId]["pattern"]
        # Escape every regex metacharacter exactly once.  The previous
        # replace("[", "\\\\[") produced the regex `\\[` (a literal backslash
        # followed by an unescaped "["), so bracketed text never matched.
        # NOTE(review): not verified against a live MongoDB instance.
        queryRegexp = re.escape(ptnText) + ";.*"
        itr = collection.find({"patterntext":{"$regex":queryRegexp}})
        print("%s %s" % (queryRegexp, itr.count()))
Ejemplo n.º 9
0
def preprocess(inputPath,inputPtnPath,outputPath):
    """Collect pattern instances per pattern id and write one JSON file each.

    ``mapper`` is run in a process pool (at most 20 processes) for every
    ".json" entry of ``inputPtnPath``.  Each job returns a nested mapping
    key -> ptnId -> relation -> instances; the reducer regroups it as
    ptnId -> relation -> key -> instances and writes every pattern to
    ``outputPath/<ptnId>.json``.

    Args:
        inputPath: forwarded unchanged to ``mapper``.
        inputPtnPath: directory listed for pattern files.
        outputPath: output directory, checked with ``projizz.checkPath``.
    """
    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool, capped at 20 workers
    processorNumber = min(multiprocessing.cpu_count(), 20)
    pool = multiprocessing.Pool(processes=processorNumber)
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        result.append( pool.apply_async( mapper, (t,filename,inputPath,inputPtnPath, model, table))  )
    pool.close()
    pool.join()

    patternInstances = {}

    # Reducer
    for r in result:
        sibf = r.get()
        for key in sibf:
            for ptnId in sibf[key]:
                # A pattern gets an entry (and later a file) even when it
                # carries no instance at all.
                byRelation = patternInstances.setdefault(ptnId, {})
                for rela in sibf[key][ptnId]:
                    for inst in sibf[key][ptnId][rela]:
                        byRelation.setdefault(rela, {}).setdefault(key, []).append(inst)

    # Write to files
    # NOTE
    # Output Format:
    # ptnId.json (json)
    # rela: keys
    #   key: line text
    for ptnId in patternInstances:
        projizz.jsonWrite(patternInstances[ptnId],os.path.join(outputPath,"%s.json" % (ptnId)))

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (3s + 5000us must read 3.005000, not 3.5000)
    # and fold whole days into the seconds count.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
def main(inputPtnPath,outputPath,pspath,inputPath,confidence,outputFilename):
    """Evaluate every pattern file and write one result file per experiment key.

    Each ".json" entry of ``inputPtnPath`` is processed by ``filterFunction``
    in a process pool.  A job returns a dict keyed by experiment name; for
    each name the "tp"/"fp"/"fn"/"et1"/"et2"/"et3" lists of every entry
    except "produced" are concatenated.  The merged result of each experiment
    is written to ``outputPath/<key>/<outputFilename>``.

    Args:
        inputPtnPath: directory listed for pattern files.
        outputPath: root output directory (created when missing).
        pspath: JSON file with pattern statistics.
        inputPath: forwarded unchanged to ``filterFunction``.
        confidence: forwarded unchanged to ``filterFunction``.
        outputFilename: file name used inside every experiment directory.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago//yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[],"et1":[],"et2":[],"et3":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    result = []
    t = 0
    for filename in os.listdir(inputPtnPath):
        if ".json" not in filename:
            continue
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence )))
        t += 1
    pool.close()
    pool.join()

    metrics = ("tp", "fp", "fn", "et1", "et2", "et3")
    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            # Lazily create the accumulator for a newly seen experiment.
            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                if m == "produced":
                    continue
                for metric in metrics:
                    expResult[keyname][m][metric] += r[keyname][m][metric]

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for keyname in expResult:
        keyDir = os.path.join(outputPath,keyname)
        if not os.path.isdir(keyDir):
            os.mkdir(keyDir)
        projizz.jsonWrite(expResult[keyname],os.path.join(keyDir,outputFilename))
        print("start write out to %s" % (keyDir))

    diff = datetime.now() - start_time
    # %06d keeps the fractional part correct; days are folded into seconds.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
Ejemplo n.º 11
0
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath, outputFilename): 
    """Run the VSM-based ``mapper`` on every pattern file and write the results.

    Each ".json" entry of ``inputPtnPath`` becomes one pool job.  A job
    returns a dict keyed by a number; for each key the "tp"/"fp"/"fn" lists
    of every entry except "produced" are concatenated, and the merged result
    is written to ``outputPath/vsm-<key>/<outputFilename>``.

    Args:
        inputPath: forwarded unchanged to ``mapper``.
        inputPtnPath: directory listed for pattern files.
        vsmPath: passed to ``projizz.getVSMmodels``.
        confidence: forwarded unchanged to ``mapper``.
        psfile: JSON file with pattern statistics.
        outputPath: root output directory, checked with ``projizz.checkPath``.
        outputFilename: file name used inside every ``vsm-<key>`` directory.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[]})
    domainRange = projizz.getYagoRelationDomainRange()
    idf,docs,lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic( projizz.jsonRead(psfile) )
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(mapper, ( t, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData  )))
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                if m == "produced":
                    continue
                for metric in ("tp", "fp", "fn"):
                    expResult[keyname][m][metric] += r[keyname][m][metric]

    for keyname in expResult:
        # %d: the experiment keys are expected to be numbers here.
        keyDir = os.path.join(outputPath, "vsm-%d" % (keyname))
        projizz.checkPath(keyDir)
        projizz.jsonWrite(expResult[keyname],os.path.join(keyDir,outputFilename))
        print("start write out to %s" % (keyDir))

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (3s + 5000us must read 3.005000, not 3.5000)
    # and fold whole days into the seconds count.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
Ejemplo n.º 12
0
def calculateOverlay():
    """Print a LaTeX-style table of pattern overlap ratios between relations.

    Patterns are counted as in the pairwise overlay: a table entry must have
    a truthy "used" flag and must not carry a falsy "eval" flag.  Relations
    are sorted by name with "produced" left out and numbered from 1.  Each
    printed row holds one relation and, for the relations numbered 13 and
    above, the share of its patterns that also list the column relation.

    A relation without any counted pattern yields 0.00 in every cell instead
    of raising ZeroDivisionError.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    relations = projizz.getYagoRelation()

    overlay = {}
    for relation in relations:
        overlay[relation] = {}
        for rela in relations:
            overlay[relation][rela] = 0

    # Build table
    for pid in table:
        entry = table[pid]
        if not entry["used"]:
            continue
        if "eval" in entry and not entry["eval"]:
            continue
        for relation in entry["relations"]:
            for rela in entry["relations"]:
                overlay[relation][rela] += 1

    # Work on a sorted copy so the list handed out by projizz is not mutated.
    yagoRela = sorted(rela for rela in relations if rela != "produced")

    # Header row: column labels (13) .. (24).
    header = ["      "] + ["& (%d)" % (i) for i in range(13,25)] + ["\\\\"]
    print(" ".join(header))

    for _id, relaA in enumerate(yagoRela, 1):
        a = overlay[relaA][relaA]
        cells = ["(%d) %s" % (_id,relaA)]
        for _tid, relaB in enumerate(yagoRela, 1):
            # Only the columns numbered 13 and above are part of this table.
            if _tid < 13:
                continue
            overC = overlay[relaA][relaB]
            ratio = float(overC) / a if a else 0.0
            cells.append(" & %2.2f" % (ratio))
        cells.append("\\\\")
        print(" ".join(cells))
Ejemplo n.º 13
0
def main(inputPtnPath,outputPath,pspath):
    """Evaluate pattern files against the top 100 unambiguous patterns.

    For every relation in the sorted statistic, the first 100 patterns (in
    the statistic's order) whose table entry lists exactly one relation are
    collected.  That list is passed to ``filterFunction`` for each ".json"
    entry of ``inputPtnPath``; the returned "tp"/"tn"/"fp"/"fn" lists are
    concatenated and dumped as JSON to ``outputPath``.

    Args:
        inputPtnPath: directory listed for pattern files.
        outputPath: path of the JSON file to write.
        pspath: JSON file read with ``projizz.jsonRead`` and passed through
            ``projizz.getSortedStatistic``.
    """
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp":[],"tn":[],"fp":[],"fn":[]})
    sp = projizz.getSortedStatistic(projizz.jsonRead(pspath))

    # Get Top 100 Relation
    topN = 100
    validate = []
    for relation in sp:
        count = 0
        for ptnId, _stat in sp[relation]:
            if len(table[ptnId]["relations"]) == 1:
                count += 1
                validate.append(ptnId)
            if count >= topN:
                break

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    result = []
    t = 0
    for filename in os.listdir(inputPtnPath):
        if ".json" not in filename:
            continue
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,validate )))
        t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            for metric in ("tp", "tn", "fp", "fn"):
                properties[m][metric] += r[m][metric]

    print("start write out to %s" % (outputPath))
    # ``with`` guarantees the JSON file is flushed and closed.
    with open(outputPath, "w") as fout:
        json.dump(properties, fout)

    diff = datetime.now() - start_time
    # %06d keeps the fractional part correct; days are folded into seconds.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
Ejemplo n.º 14
0
def main(dataInputPath,ptnOutputPath):
    """Run ``tryToFindRela`` in a process pool for every ".json" input file.

    The workers receive the output directory and produce no value that is
    merged here.  Their results are still collected so that an exception
    raised inside a worker is reported instead of being lost silently.

    Args:
        dataInputPath: directory listed for input files.
        ptnOutputPath: output directory (created when missing), forwarded to
            ``tryToFindRela``.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../yago/yagoPatternTree.table")

    if not os.path.isdir(ptnOutputPath):
        os.mkdir(ptnOutputPath)

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    print("Number of core: %d" % (multiprocessing.cpu_count()))
    start_time = datetime.now()

    jobs = []
    jobN = 0
    for filename in os.listdir(dataInputPath):
        if ".json" in filename:
            job = pool.apply_async(tryToFindRela, (jobN, filename, dataInputPath, ptnOutputPath, model, table))
            jobs.append((filename, job))
            jobN+=1

    pool.close()
    pool.join()

    # apply_async stores a worker's exception in its result object; report it
    # here (without aborting the run) so failed files do not go unnoticed.
    for filename, job in jobs:
        try:
            job.get()
        except Exception as err:
            print("Job for %s failed: %r" % (filename, err))

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (3s + 5000us must read 3.005000, not 3.5000)
    # and fold whole days into the seconds count.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds,diff.microseconds))
def main(inputPtnPath, outputPath, confidence):
    """Aggregate per-pattern "total"/"support" counters for a confidence level.

    Each ".json" entry of ``inputPtnPath`` is handed to ``filterFunction`` in
    a process pool together with ``confidence``.  Every job returns, per
    relation and pattern id, a dict with "total" and "support" numbers; these
    are summed and the result is dumped as JSON to ``outputPath``.

    Args:
        inputPtnPath: directory listed for pattern files.
        outputPath: path of the JSON file to write.
        confidence: forwarded unchanged to ``filterFunction``.
    """
    start_time = datetime.now()

    # using new table (2014.06.16)
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    properties = projizz.buildYagoProperties({})

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        result.append(
            pool.apply_async(filterFunction,
                             (t, filename, inputPtnPath, model, table,
                              copy.deepcopy(properties), confidence)))
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for rela in r:
            for ptnId in r[rela]:
                # First sighting of this pattern for the relation starts at 0.
                counters = properties[rela].setdefault(
                    ptnId, {"total": 0, "support": 0})
                counters["total"] += r[rela][ptnId]["total"]
                counters["support"] += r[rela][ptnId]["support"]

    # Close the output file deterministically instead of leaking the handle.
    with open(outputPath, "w") as fout:
        json.dump(properties, fout)

    diff = datetime.now() - start_time
    # Zero-pad the microseconds and fold whole days into the seconds count.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds,
                                     diff.microseconds))
Ejemplo n.º 16
0
def main(part, revid):
    """Print the top-ranked pattern hits of one article next to its answers.

    The file holding the revision is located with ``grep`` inside the part's
    input directory.  For the first answer document stored for ``revid``,
    every sentence that contains a token of the entity name is inspected:
    for each pattern found in the sentence only the first statistic entry is
    considered, and it is printed when its support ratio is positive and the
    relation's domain is among the entity's types.  "(O)" marks a relation
    that is in the stored answers, "(X)" one that is not.

    Args:
        part: part identifier used to build the input paths.
        revid: revision id of the article to inspect.
    """
    # Local import: only this diagnostic entry point needs ``subprocess``.
    import subprocess

    # Paths (on NLG workstation)
    inputPath = "/tmp2/ccli/yago-part-%s/" % (part)
    inputPtnPath = "/tmp2/ccli/yago-ptn-part-%s/" % (part)
    spPath = "../yago/yagoPSv1/ps.%s.json" % (part)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    itr = collection.find({"revid": revid})

    # find filename -- the arguments are passed as a list so that neither
    # revid nor the path is ever interpreted by a shell.
    grep = subprocess.Popen(["grep", "-nr", "%s" % (revid), inputPath],
                            stdout=subprocess.PIPE)
    firstHit = grep.stdout.readline()
    grep.stdout.close()
    grep.wait()
    targetFilename = firstHit.split(":")[0].split("/")[-1]
    if not targetFilename:
        # Without this guard the directory path itself would be "read" below.
        print("RevID=%s not found in %s" % (revid, inputPath))
        return
    key = "%s.txt" % (revid)

    pattern = projizz.jsonRead(inputPtnPath + targetFilename)[key]
    article = projizz.articleSimpleSentenceFileter(
        projizz.jsonRead(inputPath + targetFilename)[key])
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath))
    domainRange = projizz.getYagoRelationDomainRange()
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")

    print("Part %s, RevID=%s, in %s" % (part, revid, targetFilename))

    for ans in itr:

        targetName = ans["_id"].replace("(", "").replace(")", "").split(
            "_")  # get entity name's part
        types = ans["type"]
        answers = ans["properties"]

        print("Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"),
                                              targetName))
        print("Type=%s" % (types))
        print("Answer=%s" % (answers))

        for line in pattern:
            lineText = article[line[0]]

            # No target name in line text -> go to next line.
            if not any(namedToken in lineText for namedToken in targetName):
                continue

            for ptn in line[1]:
                ptnId = "%d" % (ptn[0])
                if not ptnId in st:
                    continue

                for ps in st[ptnId]:
                    support = float(ps[1]["support"])
                    total = float(ps[1]["total"])
                    # total == 0 used to raise ZeroDivisionError; such an
                    # entry simply has no positive support ratio.
                    if total != 0 and support / total > 0:
                        if domainRange[ps[0]]["domain"] in types:
                            print("#%d %s" % (line[0], lineText.encode("utf-8")))
                            isIn = "(X)"
                            if ps[0] in answers:
                                isIn = "(O)"
                            print("%s %s/%s/{%d,%d}/ %s" % (
                                isIn, ptnId, table[ptnId]["pattern"],
                                ps[1]["support"], ps[1]["total"], ps[0]))

                    # select top 1
                    break

        # prevent second ans
        break
Ejemplo n.º 17
0
def preprocess(inputPath,inputPtnPath,outputPath,confidence):
    """Collect per-relation article lists from all pattern files and write them.

    ``mapper`` is run in a process pool (at most 20 processes) for every
    ".json" entry of ``inputPtnPath``.  Each job returns four mappings
    relation -> list; the lists of all jobs are concatenated per relation and
    written to ``outputPath`` as ``<relation>.json``, ``<relation>.other``,
    ``<relation>.pos`` and ``<relation>.neg`` respectively.

    Args:
        inputPath: forwarded unchanged to ``mapper``.
        inputPtnPath: directory listed for pattern files.
        outputPath: output directory, checked with ``projizz.checkPath``.
        confidence: forwarded unchanged to ``mapper``.
    """
    def _merge(target, part):
        # Append every relation's list from one job to the running total.
        for rela in part:
            target.setdefault(rela, []).extend(part[rela])

    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool, capped at 20 workers
    processorNumber = min(multiprocessing.cpu_count(), 20)
    pool = multiprocessing.Pool(processes=processorNumber)
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        result.append( pool.apply_async( mapper, (t,filename,inputPath,inputPtnPath, model, table, confidence))  )
    pool.close()
    pool.join()

    modelArticles = {}
    negArticles = {}
    POSArticles = {}
    NEGArticles = {}

    # Reducer
    for r in result:
        sibr, osibr, p, n = r.get()
        _merge(modelArticles, sibr)
        _merge(negArticles, osibr)
        _merge(POSArticles, p)
        _merge(NEGArticles, n)

    #
    #   relation.json: [line, line, line, ....]
    #
    outputs = (
        (modelArticles, "%s.json"),
        (negArticles, "%s.other"),
        (POSArticles, "%s.pos"),
        (NEGArticles, "%s.neg"),
    )
    for articles, nameFormat in outputs:
        for rela in articles:
            print(rela)
            projizz.jsonWrite(articles[rela],os.path.join(outputPath,nameFormat % (rela)))

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (3s + 5000us must read 3.005000, not 3.5000)
    # and fold whole days into the seconds count.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
Ejemplo n.º 18
0
def main(part,revid):
    """Print the top-ranked pattern hits of one article next to its observed answers.

    The file holding the revision is located with ``grep`` inside the part's
    input directory.  For the first answer document stored for ``revid``,
    every sentence that contains a token of the entity name is inspected:
    for each pattern found in the sentence only the first statistic entry is
    considered, and it is printed when its support ratio is positive and the
    relation's domain is among the entity's types.  "(O)" marks a relation
    that is in the document's "observed" field, "(X)" one that is not.

    Args:
        part: part identifier used to build the input paths.
        revid: revision id of the article to inspect.
    """
    # Local import: only this diagnostic entry point needs ``subprocess``.
    import subprocess

    # Paths (on NLG workstation)
    inputPath = "/tmp2/ccli/y-part-%s/" % (part)
    inputPtnPath = "/tmp2/ccli/y-ptn-part-%s/" % (part)
    spPath = "../yago/yagoPSv2/ps.%s.json" % (part)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    itr = collection.find({"revid":revid})

    # find filename -- an argument list keeps revid and the path away from
    # any shell interpretation.
    grep = subprocess.Popen(["grep", "-nr", "%s" % (revid), inputPath], stdout=subprocess.PIPE)
    firstHit = grep.stdout.readline()
    grep.stdout.close()
    grep.wait()
    targetFilename = firstHit.split(":")[0].split("/")[-1]
    if not targetFilename:
        # Without this guard the directory path itself would be "read" below.
        print("RevID=%s not found in %s" % (revid, inputPath))
        return
    key = "%s.txt" % (revid)

    pattern = projizz.jsonRead(inputPtnPath+targetFilename)[key]
    article = projizz.articleSimpleSentenceFileter(projizz.jsonRead(inputPath+targetFilename)[key])
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath))
    domainRange = projizz.getYagoRelationDomainRange()
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../yago/yagoPatternTree.table")

    print("Part %s, RevID=%s, in %s" % (part,revid,targetFilename))

    for ans in itr:

        targetName = ans["_id"].replace("(","").replace(")","").split("_")  # get entity name's part
        types = ans["type"]
        answers = ans["observed"]

        print("Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"),targetName))
        print("Type=%s" % (types))
        print("Answer=%s" % (answers))

        for line in pattern:
            lineText = article[line[0]]

            # No target name in line text -> go to next line.
            if not any(namedToken in lineText for namedToken in targetName):
                continue

            for ptn in line[1]:
                ptnId = "%d" % (ptn[0])
                if not ptnId in st:
                    continue

                for ps in st[ptnId]:
                    support = float(ps[1]["support"])
                    total = float(ps[1]["total"])
                    # total == 0 used to raise ZeroDivisionError; such an
                    # entry simply has no positive support ratio.
                    if total != 0 and support / total > 0:
                        if domainRange[ps[0]]["domain"] in types:
                            print("#%d %s" % (line[0],lineText.encode("utf-8")))
                            isIn = "(X)"
                            if ps[0] in answers:
                                isIn = "(O)"
                            print("%s %s/%s/{%d,%d}/ %s" % (isIn,ptnId,table[ptnId]["pattern"],ps[1]["support"],ps[1]["total"],ps[0]))

                    # select top 1
                    break

        # prevent second ans
        break
Ejemplo n.º 19
0
def preprocess(inputPath,outputPath):
    """Build document-frequency, idf and per-document tf-idf files.

    ``mapper`` is run in a process pool (at most 20 processes) for every
    ".json" entry of ``inputPath`` and returns a pair (file name, mapping
    token -> term frequency).  From all jobs the document frequency of each
    token and its idf (log10 of documents / document frequency) are computed
    and written to ``documentFreq.df`` and ``idf.idf``; then one tf * idf
    weight file per returned file name is written.  All files go to
    ``outputPath``.

    Args:
        inputPath: directory listed for input files.
        outputPath: output directory, checked with ``projizz.checkPath``.
    """
    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool, capped at 20 workers
    processorNumber = min(multiprocessing.cpu_count(), 20)
    pool = multiprocessing.Pool(processes=processorNumber)
    jsonFiles = [name for name in os.listdir(inputPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        result.append( pool.apply_async( mapper, (t,filename,inputPath,outputPath, model, table))  )
    pool.close()
    pool.join()

    words = {}
    idf = {}
    tfs = {}

    # Reducer - DF
    types = 0   # number of documents returned by the workers
    for r in result:
        fn,tks = r.get()
        tfs[fn] = tks
        types += 1

        # A token counts once per document, however often it occurs in it.
        for token in tks:
            words[token] = words.get(token, 0) + 1

    print("Doc# %s words# %s" % (types, len(words)))

    projizz.jsonWrite(words,os.path.join(outputPath,"documentFreq.df"))

    # Calculate idf
    for w in words:
        if words[w] == 0:
            continue

        idf[w] = math.log(float(types)/float(words[w]),10)

    projizz.jsonWrite(idf,os.path.join(outputPath,"idf.idf"))
    print("Write out idf file")

    # Calculate tf-idf weight
    for fn in tfs:
        tks = tfs[fn]
        weight = {}
        for token in tks:
            if token not in idf:
                continue
            weight[token] = tks[token] * idf[token]

        projizz.jsonWrite(weight,os.path.join(outputPath,fn))
        print("build %s tf-idf weight" % (fn,))

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (3s + 5000us must read 3.005000, not 3.5000)
    # and fold whole days into the seconds count.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
def main(inputPtnPath,outputPath):
    
    start_time = datetime.now()

    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")

    properties = projizz.buildYagoProperties({})

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) 
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,copy.deepcopy(properties) )))
            t += 1
    pool.close()
    pool.join()

    statistics = {}

    counter = 0
    for res in result:
        r,s = res.get()
        counter += 1
        print "%d / %d ..." % (counter,len(result))

        for degree in r:
            if not degree in properties:
                properties[degree] = {}

            for ptnId in r[degree]:
                if not ptnId in properties[degree]:
                    properties[degree][ptnId] = {"occ":[],"sup":{}}

                # give up the code below for speed up
                continue

                for occId in r[degree][ptnId]["occ"]:
                    if not occId in properties[degree][ptnId]["occ"]:
                        properties[degree][ptnId]["occ"].append(occId)

                for rela in r[degree][ptnId]["sup"]:
                    if not rela in properties[degree][ptnId]["sup"]:
                        properties[degree][ptnId]["sup"][rela] = []
                    for supId in r[degree][ptnId]["sup"][rela]:
                        if not supId in properties[degree][ptnId]["sup"][rela]:
                            properties[degree][ptnId]["sup"][rela].append(supId)

        for aid in s:
            statistics[aid] = s[aid]

    json.dump(properties,open(outputPath,"w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)

    ptnNum = 0
    occDocs = []

    ptnDeg = {}

    for ptnId in table: 
        if table[ptnId]["used"]:
            if "eval" in table[ptnId] and not table[ptnId]["eval"]:
                continue

            if table[ptnId]["confidence"] > c_confidence:
                deg = len(table[ptnId]["relations"])
                if not deg in ptnDeg:
                    ptnDeg[deg] = 0

                ptnDeg[deg] += 1


    for degree in range(1,18):
        if not degree in properties:
            print "%d\t%d\t%d\t%d\t%d" % (degree,0,0,0,0)
        else:
            occ = 0
            occs = 0
            for a in statistics:
                ds = statistics[a]
                if degree in ds:
                    occ += 1
                for i in ds:
                    if i <= degree:
                        occs += 1
                        break

            print "%d\t%d\t%d\t%d\t%d" % (degree,ptnDeg[degree],len(properties[degree]),occ,occs)

    # Calculate the number of articles which has no pattern.
    noPtnCount = 0
    for a in statistics:
        ds = statistics[a]
        if len(ds) == 0:
            noPtnCount += 1

    print noPtnCount,len(statistics)
Ejemplo n.º 21
0
# -*- coding: utf-8 -*-
# qcl
#

import sys
import projizz

if len(sys.argv) <= 1:
    print "$ python ./simpleSortedViewer.py [ps json]"
else:
    filename = sys.argv[1]
    ps = projizz.jsonRead(filename)
    sortedp = projizz.getSortedStatistic(ps)
    model,table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    for relation in sortedp:
        print relation
        for ptnId,ptnS in sortedp[relation]:
            print "%s\t%s %s %s" % (relation,table[ptnId]["pattern"],ptnId,ptnS)
Ejemplo n.º 22
0
def parseYagoData():
    
    phase = "used"
    
    if phase == "build":
        model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")
    else:
        model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "./yagoPatternTreeWithConfidence.table")
    
    # function testing.
    #test = "has appeared like [[num]]"
    ##test = "has appeared like [[num]"
    #i = projizz.naiveMatchPattern(test,model) 
    #print i


    a = table.keys()
    originL = len(a)

    ptnByRelation = {}

    

    for relation in projizz.getYagoRelation():
        if not phase == "build":
            break
        
        f = open("./yagoRela/%s.txt" % (relation))
        
        print relation

        text = f.readline()
        ptnSynsetTxt = text.split("\",\" ")[1:]
        ptnSynsetTxt = ptnSynsetTxt[:-1] + [ ptnSynsetTxt[-1][:-7] ]

        ptnByRelation[relation] = []
        

        evC = 0
        for text in ptnSynsetTxt:
            ptns = text.split("#")
            # ptns[1] : pattern synset id in patty
            # ptns[3] : pattern domain
            # ptns[4] : pattern plain text
            # ptns[5] : pattern range
            # pnts[6] : confidence
            # ptns[7] : support co-occurrence
            # ptns[8] : some has, I guess it is eval result.
            if len(ptns) > 8:
                evC += 1

            patterns = ptns[4].split(";%")
            patterns = patterns[:-1] + [patterns[-1][:-1]]

            for pattern in patterns:
                pid = projizz.naiveMatchPattern(pattern,model)
                if pid < 0:
                    pass
                    #print relation,pattern
                else:
                    pid = str(pid)
                    if pid in a:
                        a.remove(pid)
                    if not pid in ptnByRelation[relation]:
                        ptnByRelation[relation].append(pid)

                    if not relation in table[pid]["relations"]:
                        table[pid]["relations"].append(relation)
                        #print relation,pid,pattern

                    ptnS = table[pid]
                    if not "confidence" in ptnS:
                        table[pid]["confidence"] = float(ptns[6])
                        table[pid]["support"] = int(ptns[7])
                        table[pid]["used"] = True
            
                        if len(ptns) > 8:
                            if ptns[8] == "false":
                                table[pid]["eval"] = False
                                #print pid,table[pid]["relations"],pattern,ptns[8]
                            else:
                                table[pid]["eval"] = True

        f.close()

    if phase == "build":

        for pid in a:
            table[pid]["used"] = False
    
        for pid in table:
            if table[pid]["used"]:
                needRemove = []
                for relation in table[pid]["relations"]:
                    if not pid in ptnByRelation[relation]:
                        print pid,table[pid]["pattern"],relation
                        needRemove.append(relation)
                for p in needRemove:
                    table[pid]["relations"].remove(p)
                if len(table[pid]["relations"]) == 0:
                    print pid,table[pid]["pattern"],"!!!"
            else:
                pass

        projizz.jsonWrite(table,"./yagoPatternTreeWithConfidence.table")

    else:
        c = 0
        used = 0
        for pid in table:
            if table[pid]["used"]:
                # 如果有true或false在,就只留True的Pattern
                if "eval" in table[pid]:
                    if not table[pid]["eval"]:
                        continue
                used += 1
                for relation in table[pid]["relations"]:
                    if not relation in ptnByRelation:
                        ptnByRelation[relation] = []
                    if not pid in ptnByRelation[relation]:
                        ptnByRelation[relation].append(pid)
            else:
                c += 1

    # 一些小計算
    #for relation in ptnByRelation:
    #    print relation,len(ptnByRelation[relation])
    
    # 找最高(意思就是不能再更高了)信心值
    # 每組Relation的最高之中最小的那一個

    minC = 1.0
    minCR = ""
    for relation in ptnByRelation:
        c75 = 0
        c50 = 0
        ptns = []
        for pid in ptnByRelation[relation]:
            ptns.append(table[pid])
            ptns[-1]["pid"] = pid
        ptns.sort(key=lambda x:x["confidence"],reverse=True)
        if ptns[0]["confidence"] < minC:
            minC = ptns[0]["confidence"]
            minCR = relation
        
        #print relation,ptns[0]
        f = open("./yagoSortedRela/%s.txt" % (relation),"w")
        for ptn in ptns:
            if ptn["confidence"] > .75:
                c75 += 1
            if ptn["confidence"] > .5:
                c50 += 1
            f.write("%s\t%s\t%.3f\t%d\t%s\n" % (ptn["pid"],ptn["pattern"],ptn["confidence"],ptn["support"],ptn["relations"]))
        f.close()

        print relation,len(ptns),c75,c50

    print minCR,minC,"pattern used:",used
Ejemplo n.º 23
0
def generate(inputSPIpath,inputTestPath,outputVSMpath,confidence):
    
    # Checking output path
    projizz.checkPath(outputVSMpath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)

    # Collect not used keys
    # because using 5-fold CV
    t = 0
    result = []
    for filename in os.listdir(inputTestPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputTestPath) )  )
            t += 1
    pool.close()
    pool.join()

    notUsedKeys = []
    for r in result:
        ks = r.get()
        notUsedKeys += ks

    ### Build Model
    # Paatern Selection
    modelArticles = projizz.buildYagoProperties([])
    words = []
    count = 0
    for filename in os.listdir(inputSPIpath):
        if ".json" in filename:
            ptnId = filename[:-5]

            # ignore invalidate pattern
            if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                continue

            count += 1
            print count,ptnId

            ptnInstance = projizz.jsonRead( os.path.join(inputSPIpath,filename) )
            for rela in ptnInstance:
                for key in ptnInstance[rela]:
                    # ignore in testing data's key
                    if key in notUsedKeys:
                        continue

                    for line in ptnInstance[rela][key]:
                        modelArticles[rela].append(line)
    
            if count%100 == 0:
                print "Read",count,"files"

    for relation in modelArticles:
        print relation
        projizz.jsonWrite(modelArticles[relation],os.path.join(outputVSMpath,"%s.txt" % (relation)))
Ejemplo n.º 24
0
def main(inputPtnPath, outputPath):

    start_time = datetime.now()

    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    properties = projizz.buildYagoProperties({})

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(filterFunction,
                                 (t, filename, inputPtnPath, model, table,
                                  copy.deepcopy(properties))))
            t += 1
    pool.close()
    pool.join()

    statistics = {}

    counter = 0
    for res in result:
        r, s = res.get()
        counter += 1
        print "%d / %d ..." % (counter, len(result))

        for degree in r:
            if not degree in properties:
                properties[degree] = {}

            for ptnId in r[degree]:
                if not ptnId in properties[degree]:
                    properties[degree][ptnId] = {"occ": [], "sup": {}}

                # give up the code below for speed up
                continue

                for occId in r[degree][ptnId]["occ"]:
                    if not occId in properties[degree][ptnId]["occ"]:
                        properties[degree][ptnId]["occ"].append(occId)

                for rela in r[degree][ptnId]["sup"]:
                    if not rela in properties[degree][ptnId]["sup"]:
                        properties[degree][ptnId]["sup"][rela] = []
                    for supId in r[degree][ptnId]["sup"][rela]:
                        if not supId in properties[degree][ptnId]["sup"][rela]:
                            properties[degree][ptnId]["sup"][rela].append(
                                supId)

        for aid in s:
            statistics[aid] = s[aid]

    json.dump(properties, open(outputPath, "w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)

    ptnNum = 0
    occDocs = []

    ptnDeg = {}

    for ptnId in table:
        if table[ptnId]["used"]:
            if "eval" in table[ptnId] and not table[ptnId]["eval"]:
                continue

            if table[ptnId]["confidence"] > c_confidence:
                deg = len(table[ptnId]["relations"])
                if not deg in ptnDeg:
                    ptnDeg[deg] = 0

                ptnDeg[deg] += 1

    for degree in range(1, 18):
        if not degree in properties:
            print "%d\t%d\t%d\t%d\t%d" % (degree, 0, 0, 0, 0)
        else:
            occ = 0
            occs = 0
            for a in statistics:
                ds = statistics[a]
                if degree in ds:
                    occ += 1
                for i in ds:
                    if i <= degree:
                        occs += 1
                        break

            print "%d\t%d\t%d\t%d\t%d" % (degree, ptnDeg[degree],
                                          len(properties[degree]), occ, occs)

    # Calculate the number of articles which has no pattern.
    noPtnCount = 0
    for a in statistics:
        ds = statistics[a]
        if len(ds) == 0:
            noPtnCount += 1

    print noPtnCount, len(statistics)
Ejemplo n.º 25
0
# -*- coding: utf-8 -*-
# qcl
#

import sys
import projizz

if len(sys.argv) <= 1:
    print "$ python ./simpleSortedViewer.py [ps json]"
else:
    filename = sys.argv[1]
    ps = projizz.jsonRead(filename)
    sortedp = projizz.getSortedStatistic(ps)
    model, table = projizz.readPrefixTreeModelWithTable(
        "./yagoPatternTree.model", "./yagoPatternTree.table")
    for relation in sortedp:
        print relation
        for ptnId, ptnS in sortedp[relation]:
            print "%s\t%s %s %s" % (relation, table[ptnId]["pattern"], ptnId,
                                    ptnS)
Ejemplo n.º 26
0
def generate(inputSPIpath, inputTestPath, outputVSMpath, confidence):

    # Checking output path
    projizz.checkPath(outputVSMpath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)

    # Collect not used keys
    # because using 5-fold CV
    t = 0
    result = []
    for filename in os.listdir(inputTestPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(mapper, (t, filename, inputTestPath)))
            t += 1
    pool.close()
    pool.join()

    notUsedKeys = []
    for r in result:
        ks = r.get()
        notUsedKeys += ks

    ### Build Model
    # Paatern Selection
    modelArticles = projizz.buildYagoProperties([])
    words = []
    count = 0
    for filename in os.listdir(inputSPIpath):
        if ".json" in filename:
            ptnId = filename[:-5]

            # ignore invalidate pattern
            if not projizz.isPatternValidate(
                    ptnId, table, confidence=confidence):
                continue

            count += 1
            print count, ptnId

            ptnInstance = projizz.jsonRead(os.path.join(
                inputSPIpath, filename))
            for rela in ptnInstance:
                for key in ptnInstance[rela]:
                    # ignore in testing data's key
                    if key in notUsedKeys:
                        continue

                    for line in ptnInstance[rela][key]:
                        modelArticles[rela].append(line)

            if count % 100 == 0:
                print "Read", count, "files"

    for relation in modelArticles:
        print relation
        projizz.jsonWrite(modelArticles[relation],
                          os.path.join(outputVSMpath, "%s.txt" % (relation)))