Esempio n. 1
0
def buildModel(inputPtnPath, outputPath):
    """Build one leave-one-out pattern-statistics model per input file.

    For each statistics file ``F`` found in ``inputPtnPath``, the ``total``
    and ``support`` counters of every *other* statistics file are summed per
    relation and pattern id, and the merged result is written as JSON to
    ``outputPath/F``.

    Args:
        inputPtnPath: Directory containing the statistics files; every entry
            whose name contains ".json" is used.
        outputPath: Directory that receives the merged models. It is created
            when it does not exist yet.
    """
    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    statisticFiles = sorted(
        name for name in os.listdir(inputPtnPath) if ".json" in name)

    for filename in statisticFiles:
        # NOTE(review): presumably returns a fresh {relation: {}} mapping --
        # confirm against projizz.buildYagoProperties.
        properties = projizz.buildYagoProperties({})
        for anotherFilename in statisticFiles:
            if filename == anotherFilename:
                # The current file is left out of its own model.
                continue
            # Close the handle deterministically instead of leaking it.
            with open(os.path.join(inputPtnPath, anotherFilename), "r") as fin:
                p = json.load(fin)
            for rela in p:
                for ptnId in p[rela]:
                    counts = properties[rela].setdefault(
                        ptnId, {"total": 0, "support": 0})
                    counts["total"] += p[rela][ptnId]["total"]
                    counts["support"] += p[rela][ptnId]["support"]

        print(filename)
        with open(os.path.join(outputPath, filename), "w") as fout:
            json.dump(properties, fout)
def buildModel(inputPtnPath, outputPath):
    """Write, for every statistics file, the sum of all the other files.

    Every entry of ``inputPtnPath`` whose name contains ".json" is treated as
    a statistics file. For each such file ``F`` the ``total`` and ``support``
    counters of all remaining files are accumulated per relation and pattern
    id, and the result is dumped as JSON to ``outputPath/F``.

    Args:
        inputPtnPath: Directory holding the statistics files.
        outputPath: Output directory; created if it is missing.
    """
    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    statisticFiles = [f for f in os.listdir(inputPtnPath) if ".json" in f]
    statisticFiles.sort()

    for filename in statisticFiles:
        # NOTE(review): assumed to yield an empty dict per relation -- verify
        # against projizz.buildYagoProperties.
        properties = projizz.buildYagoProperties({})
        for anotherFilename in statisticFiles:
            if anotherFilename == filename:
                # Skip the held-out file itself.
                continue
            # ``with`` guarantees the input file is closed after parsing.
            with open(os.path.join(inputPtnPath, anotherFilename), "r") as src:
                p = json.load(src)
            for rela in p:
                for ptnId in p[rela]:
                    if ptnId not in properties[rela]:
                        properties[rela][ptnId] = {"total": 0, "support": 0}
                    merged = properties[rela][ptnId]
                    merged["total"] += p[rela][ptnId]["total"]
                    merged["support"] += p[rela][ptnId]["support"]

        print(filename)
        # ``with`` flushes and closes the output file before the next round.
        with open(os.path.join(outputPath, filename), "w") as dst:
            json.dump(properties, dst)
def main(inputPtnPath,outputPath,pspath,inputPath):
    """Evaluate every pattern file in parallel and merge the per-file results.

    Each entry of ``inputPtnPath`` whose name contains ".json" is handed to
    ``filterFunction`` in a worker process. The ``tp``/``tn``/``fp``/``fn``
    lists returned for each relation are concatenated and the merged result
    is dumped as JSON to ``outputPath``.

    Args:
        inputPtnPath: Directory with the pattern JSON files to evaluate.
        outputPath: Path of the JSON file that receives the merged result.
        pspath: Path of the pattern-statistic JSON read via projizz.jsonRead.
        inputPath: Forwarded unchanged to ``filterFunction``.
    """
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"tn":[],"fp":[],"fn":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        # Every task gets its own copy of the empty answer template.
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath )))
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            for metric in ("tp", "tn", "fp", "fn"):
                properties[m][metric] += r[m][metric]

    print("start write out to %s" % (outputPath))
    # Close the output file deterministically instead of leaking the handle.
    with open(outputPath, "w") as fout:
        json.dump(properties, fout)

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (5000 us is ".005000", not ".5000") and count
    # whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
def main(inputPtnPath,outputPath):
    """Collect per-pattern ``total``/``support`` counts over all pattern files.

    Each entry of ``inputPtnPath`` whose name contains ".json" is processed by
    ``filterFunction`` in a worker process. The returned counters are summed
    per relation and pattern id and dumped as JSON to ``outputPath``.

    Args:
        inputPtnPath: Directory with the pattern JSON files.
        outputPath: Path of the JSON file that receives the summed counters.
    """
    start_time = datetime.now()

    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")

    properties = projizz.buildYagoProperties({})

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,copy.deepcopy(properties) )))
    pool.close()
    pool.join()

    for res in result:
        r = res.get()

        for rela in r:
            for ptnId in r[rela]:
                counts = properties[rela].setdefault(ptnId, {"total":0,"support":0})
                counts["total"] += r[rela][ptnId]["total"]
                counts["support"] += r[rela][ptnId]["support"]

    # Close the output file deterministically instead of leaking the handle.
    with open(outputPath, "w") as fout:
        json.dump(properties, fout)

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (5000 us is ".005000", not ".5000") and count
    # whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
def main(inputPtnPath,outputPath,pspath,inputPath):
    """Run ``filterFunction`` over all pattern files and merge the outcomes.

    Every entry of ``inputPtnPath`` whose name contains ".json" becomes one
    task in a process pool. For each relation in a task's result, the
    ``tp``, ``tn``, ``fp`` and ``fn`` lists are appended to the overall
    result, which is finally written as JSON to ``outputPath``.

    Args:
        inputPtnPath: Directory with the pattern JSON files to evaluate.
        outputPath: Path of the JSON file that receives the merged result.
        pspath: Path of the pattern-statistic JSON read via projizz.jsonRead.
        inputPath: Forwarded unchanged to ``filterFunction``.
    """
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp":[],"tn":[],"fp":[],"fn":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    result = []
    t = 0
    for filename in os.listdir(inputPtnPath):
        if ".json" not in filename:
            continue
        # A private deep copy keeps the tasks from sharing the template.
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath )))
        t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            merged = properties[m]
            merged["tp"] += r[m]["tp"]
            merged["tn"] += r[m]["tn"]
            merged["fp"] += r[m]["fp"]
            merged["fn"] += r[m]["fn"]

    print("start write out to %s" % (outputPath))
    # ``with`` flushes and closes the result file; the bare open() leaked it.
    with open(outputPath, "w") as fout:
        json.dump(properties, fout)

    diff = datetime.now() - start_time
    # %06d keeps leading zeros of the microsecond part; days are folded in
    # because timedelta.seconds wraps at 24 hours.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
Esempio n. 6
0
def main(inputPtnPath,outputPath,pspath,inputPath,confidence,outputFilename,nbcPath):
    """Evaluate all pattern files in parallel and write one result per key.

    Every entry of ``inputPtnPath`` whose name contains ".json" is processed
    by ``filterFunction`` in a pool of at most 8 worker processes. Each task
    returns a mapping ``keyname -> relation -> metric lists``; the ``tp``,
    ``fp``, ``fn``, ``et1``, ``et2`` and ``et3`` lists are concatenated per
    key and relation (the relation "produced" is skipped). The merged result
    of each key is written to ``outputPath/<keyname>/<outputFilename>``.

    Args:
        inputPtnPath: Directory with the pattern JSON files to evaluate.
        outputPath: Root output directory; created if it is missing.
        pspath: Path of the pattern-statistic JSON read via projizz.jsonRead.
        inputPath: Forwarded unchanged to ``filterFunction``.
        confidence: Forwarded unchanged to ``filterFunction``.
        outputFilename: File name used inside every per-key directory.
        nbcPath: Forwarded unchanged to ``filterFunction``.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago//yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[],"et1":[],"et2":[],"et3":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    # Never use more than 8 worker processes.
    cpuCount = min(multiprocessing.cpu_count(), 8)

    pool = multiprocessing.Pool(processes=cpuCount)
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,nbcPath )))
    pool.close()
    pool.join()

    metrics = ("tp", "fp", "fn", "et1", "et2", "et3")
    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                for metric in metrics:
                    expResult[keyname][m][metric] += r[keyname][m][metric]

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for keyname in expResult:
        p = expResult[keyname]
        keyDir = os.path.join(outputPath,keyname)
        if not os.path.isdir(keyDir):
            os.mkdir(keyDir)
        projizz.jsonWrite(p,os.path.join(outputPath,keyname,outputFilename))
        print("start write out to %s" % (keyDir))

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (5000 us is ".005000", not ".5000") and count
    # whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
Esempio n. 7
0
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath,
         outputFilename):
    """Run the VSM-based ``mapper`` over all pattern files and store results.

    Every entry of ``inputPtnPath`` whose name contains ".json" is processed
    by ``mapper`` in a worker process. Each task returns a mapping
    ``keyname -> relation -> metric lists``; the ``tp``, ``fp`` and ``fn``
    lists are concatenated per key and relation (the relation "produced" is
    skipped). The merged result of each key is written to
    ``outputPath/vsm-<keyname>/<outputFilename>``.

    Args:
        inputPath: Forwarded unchanged to ``mapper``.
        inputPtnPath: Directory with the pattern JSON files.
        vsmPath: Location passed to projizz.getVSMmodels.
        confidence: Forwarded unchanged to ``mapper``.
        psfile: Path of the pattern-statistic JSON read via projizz.jsonRead.
        outputPath: Root output directory, prepared with projizz.checkPath.
        outputFilename: File name used inside every ``vsm-<keyname>`` folder.
    """
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": []})
    domainRange = projizz.getYagoRelationDomainRange()
    idf, docs, lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(psfile))
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        partAns = copy.deepcopy(properties)
        result.append(
            pool.apply_async(
                mapper, (t, filename, inputPath, inputPtnPath, table, st,
                         partAns, domainRange, confidence, vsmData)))
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                for metric in ("tp", "fp", "fn"):
                    expResult[keyname][m][metric] += r[keyname][m][metric]

    for keyname in expResult:
        p = expResult[keyname]
        # The %d conversion requires keyname to be numeric.
        keydirName = "vsm-%d" % (keyname)
        projizz.checkPath(os.path.join(outputPath, keydirName))
        projizz.jsonWrite(p,
                          os.path.join(outputPath, keydirName, outputFilename))
        print("start write out to %s" % (os.path.join(outputPath, keydirName)))

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (5000 us is ".005000", not ".5000") and count
    # whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds,
                                     diff.microseconds))
Esempio n. 8
0
def main(inputPtnPath, outputPath, pspath):
    """Evaluate pattern files using only the top single-relation patterns.

    For every relation in the sorted statistic, up to 200 pattern ids whose
    table entry lists exactly one relation are collected (in the order the
    statistic provides). Each entry of ``inputPtnPath`` whose name contains
    ".json" is then processed by ``filterFunction`` in a worker process, and
    the returned ``tp``/``tn``/``fp``/``fn`` lists are concatenated per
    relation and dumped as JSON to ``outputPath``.

    Args:
        inputPtnPath: Directory with the pattern JSON files to evaluate.
        outputPath: Path of the JSON file that receives the merged result.
        pspath: Path of the statistic JSON read via projizz.jsonRead.
    """
    model, table = projizz.readPrefixTreeModelWithTable(
        "./yagoPatternTree.model", "./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({
        "tp": [],
        "tn": [],
        "fp": [],
        "fn": []
    })
    sp = projizz.getSortedStatistic(projizz.jsonRead(pspath))
    validate = []

    # Keep at most 200 single-relation patterns per relation.
    for relation in sp:
        count = 0
        for ptnId, ptnS in sp[relation]:
            ptnData = table[ptnId]
            if len(ptnData["relations"]) == 1:
                count += 1
                validate.append(ptnId)
            if count >= 200:
                break

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        partAns = copy.deepcopy(properties)
        result.append(
            pool.apply_async(filterFunction,
                             (t, filename, inputPtnPath, model, table,
                              partAns, validate)))
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            for metric in ("tp", "tn", "fp", "fn"):
                properties[m][metric] += r[m][metric]

    print("start write out to %s" % (outputPath))
    # Close the output file deterministically instead of leaking the handle.
    with open(outputPath, "w") as fout:
        json.dump(properties, fout)

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (5000 us is ".005000", not ".5000") and count
    # whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds,
                                     diff.microseconds))
def main(inputPtnPath,outputPath,pspath,inputPath,confidence,outputFilename):
    """Evaluate all pattern files in parallel and write one result per key.

    Every entry of ``inputPtnPath`` whose name contains ".json" is processed
    by ``filterFunction`` in a worker process. Each task returns a mapping
    ``keyname -> relation -> metric lists``; the ``tp``, ``fp``, ``fn``,
    ``et1``, ``et2`` and ``et3`` lists are concatenated per key and relation
    (the relation "produced" is skipped). The merged result of each key is
    written to ``outputPath/<keyname>/<outputFilename>``.

    Args:
        inputPtnPath: Directory with the pattern JSON files to evaluate.
        outputPath: Root output directory; created if it is missing.
        pspath: Path of the pattern-statistic JSON read via projizz.jsonRead.
        inputPath: Forwarded unchanged to ``filterFunction``.
        confidence: Forwarded unchanged to ``filterFunction``.
        outputFilename: File name used inside every per-key directory.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago//yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[],"et1":[],"et2":[],"et3":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    result = []
    t = 0
    for filename in os.listdir(inputPtnPath):
        if ".json" not in filename:
            continue
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence )))
        t += 1
    pool.close()
    pool.join()

    metrics = ("tp", "fp", "fn", "et1", "et2", "et3")
    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                for metric in metrics:
                    expResult[keyname][m][metric] += r[keyname][m][metric]

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for keyname in expResult:
        p = expResult[keyname]
        keyDir = os.path.join(outputPath,keyname)
        if not os.path.isdir(keyDir):
            os.mkdir(keyDir)
        projizz.jsonWrite(p,os.path.join(outputPath,keyname,outputFilename))
        print("start write out to %s" % (keyDir))

    diff = datetime.now() - start_time
    # %06d keeps leading zeros of the microsecond part; days are folded in
    # because timedelta.seconds wraps at 24 hours.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
Esempio n. 10
0
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath, outputFilename):
    """Run the VSM-based ``mapper`` over all pattern files and store results.

    Every entry of ``inputPtnPath`` whose name contains ".json" is processed
    by ``mapper`` in a worker process. Each task returns a mapping
    ``keyname -> relation -> metric lists``; the ``tp``, ``fp`` and ``fn``
    lists are concatenated per key and relation (the relation "produced" is
    skipped). The merged result of each key is written to
    ``outputPath/vsm-<keyname>/<outputFilename>``.

    Args:
        inputPath: Forwarded unchanged to ``mapper``.
        inputPtnPath: Directory with the pattern JSON files.
        vsmPath: Location passed to projizz.getVSMmodels.
        confidence: Forwarded unchanged to ``mapper``.
        psfile: Path of the pattern-statistic JSON read via projizz.jsonRead.
        outputPath: Root output directory, prepared with projizz.checkPath.
        outputFilename: File name used inside every ``vsm-<keyname>`` folder.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[]})
    domainRange = projizz.getYagoRelationDomainRange()
    idf,docs,lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic( projizz.jsonRead(psfile) )
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    result = []
    t = 0
    for filename in os.listdir(inputPtnPath):
        if ".json" not in filename:
            continue
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(mapper, ( t, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData  )))
        t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                target = expResult[keyname][m]
                target["tp"] += r[keyname][m]["tp"]
                target["fp"] += r[keyname][m]["fp"]
                target["fn"] += r[keyname][m]["fn"]

    for keyname in expResult:
        p = expResult[keyname]
        # The %d conversion requires keyname to be numeric.
        keydirName = "vsm-%d" % (keyname)
        projizz.checkPath( os.path.join(outputPath,keydirName))
        projizz.jsonWrite(p,os.path.join(outputPath,keydirName,outputFilename))
        print("start write out to %s" % (os.path.join(outputPath,keydirName)))

    diff = datetime.now() - start_time
    # %06d keeps leading zeros of the microsecond part; days are folded in
    # because timedelta.seconds wraps at 24 hours.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
Esempio n. 11
0
def main(inputFile):
    """Extract domain/range of the YAGO relations and generate a Python module.

    The schema file is parsed line by line with rdflib (N3 format). For every
    triple whose predicate contains "rdf-schema#" and whose subject (with its
    first 42 characters removed) is a known relation, the ``domain`` or
    ``range`` value is recorded. The collected mapping is printed and written
    to ``../projizz/yagoRelation.py`` as the literal ``yagoDomainRange``.

    Args:
        inputFile: Path of the YAGO schema file to read.
    """
    # Namespace declarations prepended to each line so that it can be parsed
    # on its own; hoisted out of the loop because it never changes.
    prefix = "@prefix dbp: <http://dbpedia.org/ontology/> .\n@prefix owl: <http://www.w3.org/2002/07/owl#> .\n@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n@prefix skos: <http://www.w3.org/2004/02/skos/core#> .\n@prefix xsd: <http://www.w3.org/2001/XMLSchema#> ."

    yagoRelations = projizz.buildYagoProperties({})

    j = 0
    # ``with`` closes the schema file even when parsing raises.
    with open(inputFile, 'r') as fp:
        for line in fp:
            # Skip comments, directives and blank lines.
            if line[0] == "#" or line[0] == "@" or line[0] == "\n":
                continue

            g = rdflib.Graph()
            r = g.parse(data=prefix + line, format="n3")
            for instance in r:
                # NOTE(review): the offsets 42 and 37 presumably strip fixed
                # namespace prefixes from subject/object and predicate --
                # confirm against the input data.
                if not "rdf-schema#" in instance[1] or not instance[0][
                        42:] in yagoRelations:
                    continue

                j += 1

                yagoRelation = "%s" % (instance[0][42:])
                relaProperty = "%s" % (instance[1][37:])
                if "wl#Thin" in "%s" % (instance[2]):
                    content = "wl:Thin"
                else:
                    content = "%s" % (instance[2][42:])

                if not "domain" in relaProperty and not "range" in relaProperty:
                    continue

                yagoRelations[yagoRelation][relaProperty] = content

    print("Read %d lines. Finished." % j)
    # "produced" always gets empty domain and range values.
    yagoRelations["produced"]["domain"] = ""
    yagoRelations["produced"]["range"] = ""
    for prop in yagoRelations:
        print("%s %s %s" % (prop, yagoRelations[prop]["domain"],
                            yagoRelations[prop]["range"]))

    # ``with`` guarantees the generated module is flushed and closed.
    with open("../projizz/yagoRelation.py", "w") as f:
        f.write(
            "# -*- coding: utf-8 -*-\n# qcl, automatically generated by ../yago/parse.yago.schema.py\n# the domain and range of each yago relation used in projizz\n"
        )
        f.write("yagoDomainRange = %s\n" % (yagoRelations))
def main(inputFile):
    """Generate ``../projizz/yagoRelation.py`` from a YAGO schema file.

    Each non-comment line of ``inputFile`` is parsed with rdflib as N3. For
    triples whose predicate contains "rdf-schema#" and whose subject (minus
    its first 42 characters) is a known relation, the ``domain`` / ``range``
    value is stored. The resulting mapping is printed per relation and then
    written out as the Python literal ``yagoDomainRange``.

    Args:
        inputFile: Path of the YAGO schema file to read.
    """
    # Constant N3 header; defined once instead of on every input line.
    prefix = "@prefix dbp: <http://dbpedia.org/ontology/> .\n@prefix owl: <http://www.w3.org/2002/07/owl#> .\n@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n@prefix skos: <http://www.w3.org/2004/02/skos/core#> .\n@prefix xsd: <http://www.w3.org/2001/XMLSchema#> ."

    yagoRelations = projizz.buildYagoProperties({})

    j = 0
    # The input file is closed as soon as the loop finishes or fails.
    with open(inputFile,'r') as fp:
        for line in fp:
            # Comments ("#"), directives ("@") and empty lines carry no triple.
            if line[0] == "#" or line[0] == "@" or line[0] == "\n":
                continue

            g = rdflib.Graph()
            r = g.parse(data=prefix+line,format="n3")
            for instance in r:
                # NOTE(review): 42 and 37 look like the lengths of fixed URI
                # prefixes being cut off -- verify against the input data.
                if not "rdf-schema#" in instance[1] or not instance[0][42:] in yagoRelations:
                    continue

                j += 1

                yagoRelation    = "%s" % (instance[0][42:])
                relaProperty    = "%s" % (instance[1][37:])
                if "wl#Thin" in "%s" % (instance[2]):
                    content         = "wl:Thin"
                else:
                    content         = "%s" % (instance[2][42:])

                if not "domain" in relaProperty and not "range" in relaProperty:
                    continue

                yagoRelations[yagoRelation][relaProperty] = content

    print("Read %d lines. Finished." % j)
    # "produced" is given empty domain and range values unconditionally.
    yagoRelations["produced"]["domain"] = ""
    yagoRelations["produced"]["range"] = ""
    for prop in yagoRelations:
        print("%s %s %s" % (prop,yagoRelations[prop]["domain"],yagoRelations[prop]["range"]))

    # Closing the file explicitly makes sure the generated module is complete.
    with open("../projizz/yagoRelation.py","w") as f:
        f.write("# -*- coding: utf-8 -*-\n# qcl, automatically generated by ../yago/parse.yago.schema.py\n# the domain and range of each yago relation used in projizz\n")
        f.write("yagoDomainRange = %s\n" % (yagoRelations))
def main(inputPtnPath,outputPath,pspath):
    """Evaluate pattern files using only the top single-relation patterns.

    For every relation in the sorted statistic, up to 100 pattern ids whose
    table entry lists exactly one relation are collected (in the order the
    statistic provides). Each entry of ``inputPtnPath`` whose name contains
    ".json" is then processed by ``filterFunction`` in a worker process, and
    the returned ``tp``/``tn``/``fp``/``fn`` lists are concatenated per
    relation and dumped as JSON to ``outputPath``.

    Args:
        inputPtnPath: Directory with the pattern JSON files to evaluate.
        outputPath: Path of the JSON file that receives the merged result.
        pspath: Path of the statistic JSON read via projizz.jsonRead.
    """
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp":[],"tn":[],"fp":[],"fn":[]})
    sp = projizz.getSortedStatistic(projizz.jsonRead(pspath))
    validate = []

    # Keep at most 100 single-relation patterns per relation.
    for relation in sp:
        count = 0
        for ptnId,ptnS in sp[relation]:
            ptnData = table[ptnId]
            if len(ptnData["relations"]) == 1:
                count += 1
                validate.append(ptnId)
            if count >= 100:
                break

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    result = []
    t = 0
    for filename in os.listdir(inputPtnPath):
        if ".json" not in filename:
            continue
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,validate )))
        t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            merged = properties[m]
            merged["tp"] += r[m]["tp"]
            merged["tn"] += r[m]["tn"]
            merged["fp"] += r[m]["fp"]
            merged["fn"] += r[m]["fn"]

    print("start write out to %s" % (outputPath))
    # ``with`` flushes and closes the result file; the bare open() leaked it.
    with open(outputPath, "w") as fout:
        json.dump(properties, fout)

    diff = datetime.now() - start_time
    # %06d keeps leading zeros of the microsecond part; days are folded in
    # because timedelta.seconds wraps at 24 hours.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
def main(inputPtnPath, outputPath, confidence):
    """Collect per-pattern ``total``/``support`` counts over all pattern files.

    Each entry of ``inputPtnPath`` whose name contains ".json" is processed by
    ``filterFunction`` in a worker process. The returned counters are summed
    per relation and pattern id and dumped as JSON to ``outputPath``.

    Args:
        inputPtnPath: Directory with the pattern JSON files.
        outputPath: Path of the JSON file that receives the summed counters.
        confidence: Forwarded unchanged to ``filterFunction``.
    """
    start_time = datetime.now()

    # using new table (2014.06.16)
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    properties = projizz.buildYagoProperties({})

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        result.append(
            pool.apply_async(filterFunction,
                             (t, filename, inputPtnPath, model, table,
                              copy.deepcopy(properties), confidence)))
    pool.close()
    pool.join()

    for res in result:
        r = res.get()

        for rela in r:
            for ptnId in r[rela]:
                counts = properties[rela].setdefault(
                    ptnId, {"total": 0, "support": 0})
                counts["total"] += r[rela][ptnId]["total"]
                counts["support"] += r[rela][ptnId]["support"]

    # Close the output file deterministically instead of leaking the handle.
    with open(outputPath, "w") as fout:
        json.dump(properties, fout)

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (5000 us is ".005000", not ".5000") and count
    # whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds,
                                     diff.microseconds))
Esempio n. 15
0
def generate(inputSPIpath,inputTestPath,outputVSMpath,confidence):
    """Build the per-relation article collections used for the VSM model.

    First, ``mapper`` is run in a process pool over every entry of
    ``inputTestPath`` whose name contains ".json"; the keys it returns are
    excluded from the model (they belong to the test fold). Then every
    ``<ptnId>.json`` file in ``inputSPIpath`` that passes
    ``projizz.isPatternValidate`` is read and its lines are appended to the
    collection of the corresponding relation, skipping excluded keys. Each
    collection is finally written to ``outputVSMpath/<relation>.txt``.

    Args:
        inputSPIpath: Directory with one JSON file per pattern id.
        inputTestPath: Directory with the test-fold JSON files.
        outputVSMpath: Output directory, prepared with projizz.checkPath.
        confidence: Threshold forwarded to projizz.isPatternValidate.
    """
    # Checking output path
    projizz.checkPath(outputVSMpath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool, capped at 20 workers
    proceessorNumber = min(multiprocessing.cpu_count(), 20)
    pool = multiprocessing.Pool(processes=proceessorNumber)

    # Collect not used keys
    # because using 5-fold CV
    testFiles = [name for name in os.listdir(inputTestPath) if ".json" in name]
    result = []
    for t, filename in enumerate(testFiles):
        result.append( pool.apply_async( mapper, (t,filename,inputTestPath) )  )
    pool.close()
    pool.join()

    # A set makes the membership test in the innermost loop O(1) instead of
    # scanning a list. Assumes the keys are hashable (they are compared with
    # JSON object keys below).
    notUsedKeys = set()
    for r in result:
        notUsedKeys.update(r.get())

    ### Build Model
    # Pattern Selection
    modelArticles = projizz.buildYagoProperties([])
    count = 0
    for filename in os.listdir(inputSPIpath):
        # The pattern id is the name minus its 5-character ".json" suffix, so
        # only names that really end in ".json" yield a correct id.
        if not filename.endswith(".json"):
            continue
        ptnId = filename[:-5]

        # ignore invalidate pattern
        if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
            continue

        count += 1
        print("%d %s" % (count, ptnId))

        ptnInstance = projizz.jsonRead( os.path.join(inputSPIpath,filename) )
        for rela in ptnInstance:
            for key in ptnInstance[rela]:
                # ignore in testing data's key
                if key in notUsedKeys:
                    continue

                for line in ptnInstance[rela][key]:
                    modelArticles[rela].append(line)

        if count%100 == 0:
            print("Read %d files" % count)

    for relation in modelArticles:
        print(relation)
        projizz.jsonWrite(modelArticles[relation],os.path.join(outputVSMpath,"%s.txt" % (relation)))
Esempio n. 16
0
def main(inputPtnPath, outputPath):
    """Collect pattern-degree statistics over all pattern files and report them.

    Each entry of ``inputPtnPath`` whose name contains ".json" is processed by
    ``filterFunction`` in a worker process. Every task returns a pair
    ``(r, s)``: ``r`` maps a degree to the pattern ids seen with that degree,
    ``s`` maps an article id to the degrees found in it. The set of seen
    patterns per degree is dumped as JSON to ``outputPath``; afterwards one
    tab-separated row per degree 1..17 is printed with: the degree, the
    number of usable table patterns of that degree above ``c_confidence``,
    the number of distinct patterns seen, the number of articles containing
    that degree, and the number of articles containing any degree up to it.
    The last line printed is the number of articles without any pattern and
    the total number of articles.

    Args:
        inputPtnPath: Directory with the pattern JSON files.
        outputPath: Path of the JSON file that receives the seen patterns.
    """
    start_time = datetime.now()

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    properties = projizz.buildYagoProperties({})

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    jsonFiles = [name for name in os.listdir(inputPtnPath) if ".json" in name]
    result = []
    for t, filename in enumerate(jsonFiles):
        result.append(
            pool.apply_async(filterFunction,
                             (t, filename, inputPtnPath, model, table,
                              copy.deepcopy(properties))))
    pool.close()
    pool.join()

    statistics = {}

    counter = 0
    for res in result:
        r, s = res.get()
        counter += 1
        print("%d / %d ..." % (counter, len(result)))

        for degree in r:
            seen = properties.setdefault(degree, {})
            for ptnId in r[degree]:
                # Only the presence of a pattern is recorded. Merging the
                # individual "occ"/"sup" ids was already disabled for speed
                # (the statements were unreachable), so those stay empty.
                seen.setdefault(ptnId, {"occ": [], "sup": {}})

        for aid in s:
            statistics[aid] = s[aid]

    # Close the output file deterministically instead of leaking the handle.
    with open(outputPath, "w") as fout:
        json.dump(properties, fout)

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (5000 us is ".005000", not ".5000") and count
    # whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds,
                                     diff.microseconds))

    # Number of usable table patterns per degree (= number of relations).
    ptnDeg = {}

    for ptnId in table:
        if table[ptnId]["used"]:
            if "eval" in table[ptnId] and not table[ptnId]["eval"]:
                continue

            # NOTE(review): c_confidence is not defined in this function; it
            # is presumably a module-level constant -- confirm.
            if table[ptnId]["confidence"] > c_confidence:
                deg = len(table[ptnId]["relations"])
                ptnDeg[deg] = ptnDeg.get(deg, 0) + 1

    for degree in range(1, 18):
        if not degree in properties:
            print("%d\t%d\t%d\t%d\t%d" % (degree, 0, 0, 0, 0))
        else:
            occ = 0
            occs = 0
            for a in statistics:
                ds = statistics[a]
                if degree in ds:
                    occ += 1
                if any(i <= degree for i in ds):
                    occs += 1

            # .get() avoids a KeyError when a degree was seen in the data but
            # no table pattern of that degree passed the confidence filter.
            print("%d\t%d\t%d\t%d\t%d" % (degree, ptnDeg.get(degree, 0),
                                          len(properties[degree]), occ, occs))

    # Calculate the number of articles which has no pattern.
    noPtnCount = 0
    for a in statistics:
        if len(statistics[a]) == 0:
            noPtnCount += 1

    print("%d %d" % (noPtnCount, len(statistics)))
Esempio n. 17
0
def generate(inputSPIpath, inputTestPath, outputVSMpath, confidence):
    """Build the per-relation article collections used for the VSM model.

    First, ``mapper`` is run in a process pool over every entry of
    ``inputTestPath`` whose name contains ".json"; the keys it returns are
    excluded from the model (they belong to the test fold). Then every
    ``<ptnId>.json`` file in ``inputSPIpath`` that passes
    ``projizz.isPatternValidate`` is read and its lines are appended to the
    collection of the corresponding relation, skipping excluded keys. Each
    collection is finally written to ``outputVSMpath/<relation>.txt``.

    Args:
        inputSPIpath: Directory with one JSON file per pattern id.
        inputTestPath: Directory with the test-fold JSON files.
        outputVSMpath: Output directory, prepared with projizz.checkPath.
        confidence: Threshold forwarded to projizz.isPatternValidate.
    """
    # Checking output path
    projizz.checkPath(outputVSMpath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool, capped at 20 workers
    proceessorNumber = min(multiprocessing.cpu_count(), 20)
    pool = multiprocessing.Pool(processes=proceessorNumber)

    # Collect not used keys
    # because using 5-fold CV
    result = []
    t = 0
    for filename in os.listdir(inputTestPath):
        if ".json" not in filename:
            continue
        result.append(
            pool.apply_async(mapper, (t, filename, inputTestPath)))
        t += 1
    pool.close()
    pool.join()

    # Stored as a set so the lookup in the innermost loop below is O(1)
    # rather than a list scan. Assumes the keys are hashable (they are
    # compared with JSON object keys).
    notUsedKeys = set()
    for r in result:
        notUsedKeys.update(r.get())

    ### Build Model
    # Pattern Selection
    modelArticles = projizz.buildYagoProperties([])
    count = 0
    for filename in os.listdir(inputSPIpath):
        # filename[:-5] only yields the pattern id when the name really ends
        # in ".json"; a substring match would let e.g. "x.json.bak" through.
        if not filename.endswith(".json"):
            continue
        ptnId = filename[:-5]

        # ignore invalidate pattern
        if not projizz.isPatternValidate(
                ptnId, table, confidence=confidence):
            continue

        count += 1
        print("%d %s" % (count, ptnId))

        ptnInstance = projizz.jsonRead(os.path.join(
            inputSPIpath, filename))
        for rela in ptnInstance:
            for key in ptnInstance[rela]:
                # ignore in testing data's key
                if key in notUsedKeys:
                    continue

                for line in ptnInstance[rela][key]:
                    modelArticles[rela].append(line)

        if count % 100 == 0:
            print("Read %d files" % count)

    for relation in modelArticles:
        print(relation)
        projizz.jsonWrite(modelArticles[relation],
                          os.path.join(outputVSMpath, "%s.txt" % (relation)))
def main(inputPtnPath,outputPath):
    """Gather and print pattern-degree statistics for all pattern files.

    Each entry of ``inputPtnPath`` whose name contains ".json" is processed by
    ``filterFunction`` in a worker process. Every task returns a pair
    ``(r, s)``: ``r`` maps a degree to the pattern ids seen with that degree,
    ``s`` maps an article id to the degrees found in it. The set of seen
    patterns per degree is dumped as JSON to ``outputPath``; afterwards one
    tab-separated row per degree 1..17 is printed with: the degree, the
    number of usable table patterns of that degree above ``c_confidence``,
    the number of distinct patterns seen, the number of articles containing
    that degree, and the number of articles containing any degree up to it.
    The last line printed is the number of articles without any pattern and
    the total number of articles.

    Args:
        inputPtnPath: Directory with the pattern JSON files.
        outputPath: Path of the JSON file that receives the seen patterns.
    """
    start_time = datetime.now()

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")

    properties = projizz.buildYagoProperties({})

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    result = []
    t = 0
    for filename in os.listdir(inputPtnPath):
        if ".json" not in filename:
            continue
        result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,copy.deepcopy(properties) )))
        t += 1
    pool.close()
    pool.join()

    statistics = {}

    for counter, res in enumerate(result, 1):
        r,s = res.get()
        print("%d / %d ..." % (counter,len(result)))

        for degree in r:
            if degree not in properties:
                properties[degree] = {}

            for ptnId in r[degree]:
                # Only the presence of a pattern is tracked; the detailed
                # merge of "occ"/"sup" ids had already been switched off for
                # speed (unreachable statements), so those stay empty.
                if ptnId not in properties[degree]:
                    properties[degree][ptnId] = {"occ":[],"sup":{}}

        for aid in s:
            statistics[aid] = s[aid]

    # ``with`` flushes and closes the result file; the bare open() leaked it.
    with open(outputPath,"w") as fout:
        json.dump(properties,fout)

    diff = datetime.now() - start_time
    # %06d keeps leading zeros of the microsecond part; days are folded in
    # because timedelta.seconds wraps at 24 hours.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))

    # Number of usable table patterns per degree (= number of relations).
    ptnDeg = {}

    for ptnId in table:
        if table[ptnId]["used"]:
            if "eval" in table[ptnId] and not table[ptnId]["eval"]:
                continue

            # NOTE(review): c_confidence is not defined in this function; it
            # is presumably a module-level constant -- confirm.
            if table[ptnId]["confidence"] > c_confidence:
                deg = len(table[ptnId]["relations"])
                if deg not in ptnDeg:
                    ptnDeg[deg] = 0

                ptnDeg[deg] += 1

    for degree in range(1,18):
        if degree not in properties:
            print("%d\t%d\t%d\t%d\t%d" % (degree,0,0,0,0))
            continue

        occ = 0
        occs = 0
        for a in statistics:
            ds = statistics[a]
            if degree in ds:
                occ += 1
            if any(i <= degree for i in ds):
                occs += 1

        # .get() avoids a KeyError when a degree appears in the data but no
        # table pattern of that degree passed the confidence filter.
        print("%d\t%d\t%d\t%d\t%d" % (degree,ptnDeg.get(degree, 0),len(properties[degree]),occ,occs))

    # Calculate the number of articles which has no pattern.
    noPtnCount = 0
    for a in statistics:
        if len(statistics[a]) == 0:
            noPtnCount += 1

    print("%d %d" % (noPtnCount,len(statistics)))