Ejemplo n.º 1
0
def preprocess(inputPath, inputPtnPath, outputPath):

    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(
                    mapper,
                    (t, filename, inputPath, inputPtnPath, model, table)))
            t += 1
    pool.close()
    pool.join()

    patternInstances = {}

    # Reducer
    for r in result:
        sibf = r.get()
        for key in sibf:
            for ptnId in sibf[key]:
                if not ptnId in patternInstances:
                    patternInstances[ptnId] = {}
                for rela in sibf[key][ptnId]:
                    for inst in sibf[key][ptnId][rela]:
                        if not rela in patternInstances[ptnId]:
                            patternInstances[ptnId][rela] = {}
                        if not key in patternInstances[ptnId][rela]:
                            patternInstances[ptnId][rela][key] = []
                        patternInstances[ptnId][rela][key].append(inst)

    # Write to files
    # NOTE
    # Output Format:
    # ptnId.json (json)
    # rela: keys
    #   key: line text
    for ptnId in patternInstances:
        projizz.jsonWrite(patternInstances[ptnId],
                          os.path.join(outputPath, "%s.json" % (ptnId)))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Ejemplo n.º 2
0
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath,
         outputFilename):

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": []})
    domainRange = projizz.getYagoRelationDomainRange()
    idf, docs, lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(psfile))
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(
                pool.apply_async(
                    mapper, (t, filename, inputPath, inputPtnPath, table, st,
                             partAns, domainRange, confidence, vsmData)))
            #result.append( mapper( t, filename, inputPath, inputPtnPath, table, partAns, domainRange, confidence, vsmData  ))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:

            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]

    for keyname in expResult:
        p = expResult[keyname]
        keydirName = "vsm-%d" % (keyname)
        projizz.checkPath(os.path.join(outputPath, keydirName))
        projizz.jsonWrite(p,
                          os.path.join(outputPath, keydirName, outputFilename))
        print "start write out to %s" % (os.path.join(outputPath, keydirName))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Ejemplo n.º 3
0
def preprocess(inputPath,inputPtnPath,outputPath):

    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputPath,inputPtnPath, model, table))  )
            t += 1
    pool.close()
    pool.join()

    patternInstances = {}

    # Reducer
    for r in result:
        sibf = r.get()
        for key in sibf:
            for ptnId in sibf[key]:
                if not ptnId in patternInstances:
                    patternInstances[ptnId] = {}
                for rela in sibf[key][ptnId]:
                    for inst in sibf[key][ptnId][rela]:
                        if not rela in patternInstances[ptnId]:
                            patternInstances[ptnId][rela] = {}
                        if not key in patternInstances[ptnId][rela]:
                            patternInstances[ptnId][rela][key] = []
                        patternInstances[ptnId][rela][key].append(inst)

    
    # Write to files
    # NOTE
    # Output Format:
    # ptnId.json (json)
    # rela: keys
    #   key: line text
    for ptnId in patternInstances:
        projizz.jsonWrite(patternInstances[ptnId],os.path.join(outputPath,"%s.json" % (ptnId))) 

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Ejemplo n.º 4
0
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath, outputFilename): 
    
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[]})
    domainRange = projizz.getYagoRelationDomainRange()
    idf,docs,lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic( projizz.jsonRead(psfile) )
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) 
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(mapper, ( t, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData  )))
            #result.append( mapper( t, filename, inputPath, inputPtnPath, table, partAns, domainRange, confidence, vsmData  ))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:

            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)

            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]


    for keyname in expResult:
        p = expResult[keyname]
        keydirName = "vsm-%d" % (keyname)
        projizz.checkPath( os.path.join(outputPath,keydirName))
        projizz.jsonWrite(p,os.path.join(outputPath,keydirName,outputFilename))
        print "start write out to %s" % (os.path.join(outputPath,keydirName))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Ejemplo n.º 5
0
def buildModels(inputpath, outputPath):

    projizz.checkPath(outputPath)

    cpuCount = multiprocessing.cpu_count()
    if cpuCount > 8:
        cpuCount = 8

    pool = multiprocessing.Pool(processes=cpuCount)
    t = 0

    relations = projizz.getYagoRelation()
    for relation in relations:
        if relation == "produced":
            continue
        pool.apply_async(trainModel, (t, relation, inputpath, outputPath))
        t += 1
    pool.close()
    pool.join()

    print "Done training all classifiers"
Ejemplo n.º 6
0
def buildModels(inputpath,outputPath):

    projizz.checkPath(outputPath)

    cpuCount = multiprocessing.cpu_count()
    if cpuCount > 8:
        cpuCount = 8

    pool = multiprocessing.Pool(processes=cpuCount) 
    t = 0

    relations = projizz.getYagoRelation()
    for relation in relations:
        if relation == "produced":
            continue
        pool.apply_async(trainModel, (t,relation,inputpath,outputPath))
        t += 1
    pool.close()
    pool.join()

    print "Done training all classifiers"
Ejemplo n.º 7
0
def preprocess(inputPath, topN, outputPath):

    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(
                    mapper,
                    (t, filename, inputPath, topN, outputPath, model, table)))
            t += 1
    pool.close()
    pool.join()

    words = {}
    idf = {}
    tfs = {}

    # Reducer - DF
    types = 0
    for r in result:
        fn, tks = r.get()
        tfs[fn] = tks
        types += 1

        for t in tks:
            if t not in words:
                words[t] = 0
            words[t] += 1

    print "Doc#", types, "words#", len(words)

    projizz.jsonWrite(words, os.path.join(outputPath, "documentFreq.df"))

    # Calculate idf
    for w in words:
        if words[w] == 0:
            continue

        idf[w] = math.log(float(types) / float(words[w]), 10)

    projizz.jsonWrite(idf, os.path.join(outputPath, "idf.idf"))
    print "Write out idf file"

    # Calculate td-idf weight
    for fn in tfs:
        tks = tfs[fn]
        weight = {}
        for t in tks:
            tf = tks[t]
            if t not in idf:
                continue

            weight[t] = tf * idf[t]

        projizz.jsonWrite(weight, os.path.join(outputPath, fn))
        print "build", fn, "tf-idf weight"

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Ejemplo n.º 8
0
def preprocess(inputPath, inputPtnPath, outputPath, confidence):

    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(mapper, (t, filename, inputPath, inputPtnPath,
                                          model, table, confidence)))
            #result.append( mapper(t,filename,inputPath,inputPtnPath, model, table, confidence))
            t += 1
    pool.close()
    pool.join()

    modelArticles = {}
    negAritcles = {}

    POSArticles = {}
    NEGArticles = {}

    # Reducer
    for r in result:
        sibr, osibr, p, n = r.get()

        for rela in sibr:
            if not rela in modelArticles:
                modelArticles[rela] = []
            modelArticles[rela] += sibr[rela]

        for rela in osibr:
            if not rela in negAritcles:
                negAritcles[rela] = []
            negAritcles[rela] += osibr[rela]

        for rela in p:
            if not rela in POSArticles:
                POSArticles[rela] = []
            POSArticles[rela] += p[rela]

        for rela in n:
            if not rela in NEGArticles:
                NEGArticles[rela] = []
            NEGArticles[rela] += n[rela]

    #
    #   relation.json: [line, line, line, ....]
    #

    for rela in modelArticles:
        print rela
        projizz.jsonWrite(modelArticles[rela],
                          os.path.join(outputPath, "%s.json" % (rela)))

    for rela in negAritcles:
        print rela
        projizz.jsonWrite(negAritcles[rela],
                          os.path.join(outputPath, "%s.other" % (rela)))

    for rela in POSArticles:
        print rela
        projizz.jsonWrite(POSArticles[rela],
                          os.path.join(outputPath, "%s.pos" % (rela)))

    for rela in NEGArticles:
        print rela
        projizz.jsonWrite(NEGArticles[rela],
                          os.path.join(outputPath, "%s.neg" % (rela)))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Ejemplo n.º 9
0
def generate(inputSPIpath, inputTestPath, outputVSMpath, confidence):

    # Checking output path
    projizz.checkPath(outputVSMpath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)

    # Collect not used keys
    # because using 5-fold CV
    t = 0
    result = []
    for filename in os.listdir(inputTestPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(mapper, (t, filename, inputTestPath)))
            t += 1
    pool.close()
    pool.join()

    notUsedKeys = []
    for r in result:
        ks = r.get()
        notUsedKeys += ks

    ### Build Model
    # Paatern Selection
    modelArticles = projizz.buildYagoProperties([])
    words = []
    count = 0
    for filename in os.listdir(inputSPIpath):
        if ".json" in filename:
            ptnId = filename[:-5]

            # ignore invalidate pattern
            if not projizz.isPatternValidate(
                    ptnId, table, confidence=confidence):
                continue

            count += 1
            print count, ptnId

            ptnInstance = projizz.jsonRead(os.path.join(
                inputSPIpath, filename))
            for rela in ptnInstance:
                for key in ptnInstance[rela]:
                    # ignore in testing data's key
                    if key in notUsedKeys:
                        continue

                    for line in ptnInstance[rela][key]:
                        modelArticles[rela].append(line)

            if count % 100 == 0:
                print "Read", count, "files"

    for relation in modelArticles:
        print relation
        projizz.jsonWrite(modelArticles[relation],
                          os.path.join(outputVSMpath, "%s.txt" % (relation)))
Ejemplo n.º 10
0
def generate(inputSPIpath,inputTestPath,outputVSMpath,confidence):
    
    # Checking output path
    projizz.checkPath(outputVSMpath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)

    # Collect not used keys
    # because using 5-fold CV
    t = 0
    result = []
    for filename in os.listdir(inputTestPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputTestPath) )  )
            t += 1
    pool.close()
    pool.join()

    notUsedKeys = []
    for r in result:
        ks = r.get()
        notUsedKeys += ks

    ### Build Model
    # Paatern Selection
    modelArticles = projizz.buildYagoProperties([])
    words = []
    count = 0
    for filename in os.listdir(inputSPIpath):
        if ".json" in filename:
            ptnId = filename[:-5]

            # ignore invalidate pattern
            if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                continue

            count += 1
            print count,ptnId

            ptnInstance = projizz.jsonRead( os.path.join(inputSPIpath,filename) )
            for rela in ptnInstance:
                for key in ptnInstance[rela]:
                    # ignore in testing data's key
                    if key in notUsedKeys:
                        continue

                    for line in ptnInstance[rela][key]:
                        modelArticles[rela].append(line)
    
            if count%100 == 0:
                print "Read",count,"files"

    for relation in modelArticles:
        print relation
        projizz.jsonWrite(modelArticles[relation],os.path.join(outputVSMpath,"%s.txt" % (relation)))
Ejemplo n.º 11
0
def preprocess(inputPath,outputPath):

    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputPath,outputPath, model, table))  )
            t += 1
    pool.close()
    pool.join()

    words = {}
    idf = {}
    tfs = {}

    # Reducer - DF
    types = 0
    for r in result:
        fn,tks = r.get()
        tfs[fn] = tks
        types += 1

        for t in tks:
            if t not in words:
                words[t] = 0
            words[t] += 1

    print "Doc#",types,"words#",len(words)

    projizz.jsonWrite(words,os.path.join(outputPath,"documentFreq.df"))

    # Calculate idf
    for w in words:
        if words[w] == 0:
            continue

        idf[w] = math.log(float(types)/float(words[w]),10)
    
    projizz.jsonWrite(idf,os.path.join(outputPath,"idf.idf"))
    print "Write out idf file"

    # Calculate td-idf weight
    for fn in tfs:
        tks = tfs[fn]
        weight = {}
        for t in tks:
            tf = tks[t]
            if t not in idf:
                continue
            
            weight[t] = tf * idf[t]

        projizz.jsonWrite(weight,os.path.join(outputPath,fn))
        print "build",fn,"tf-idf weight"


    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
Ejemplo n.º 12
0
def preprocess(inputPath,inputPtnPath,outputPath,confidence):

    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputPath,inputPtnPath, model, table, confidence))  )
            #result.append( mapper(t,filename,inputPath,inputPtnPath, model, table, confidence))
            t += 1
    pool.close()
    pool.join()

    modelArticles = {}
    negAritcles = {}

    POSArticles = {}
    NEGArticles = {}

    # Reducer
    for r in result:
        sibr, osibr, p, n = r.get()

        for rela in sibr:
            if not rela in modelArticles:
                modelArticles[rela] = []
            modelArticles[rela] += sibr[rela]

        for rela in osibr:
            if not rela in negAritcles:
                negAritcles[rela] = []
            negAritcles[rela] += osibr[rela]

        for rela in p:
            if not rela in POSArticles:
                POSArticles[rela] = []
            POSArticles[rela] += p[rela]

        for rela in n:
            if not rela in NEGArticles:
                NEGArticles[rela] = []
            NEGArticles[rela] += n[rela]

    #
    #   relation.json: [line, line, line, ....]
    #

    for rela in modelArticles:
        print rela
        projizz.jsonWrite(modelArticles[rela],os.path.join(outputPath,"%s.json" % (rela))) 

    for rela in negAritcles:
        print rela
        projizz.jsonWrite(negAritcles[rela],os.path.join(outputPath,"%s.other" % (rela))) 

    for rela in POSArticles:
        print rela
        projizz.jsonWrite(POSArticles[rela],os.path.join(outputPath,"%s.pos" % (rela)))
        
    for rela in NEGArticles:
        print rela
        projizz.jsonWrite(NEGArticles[rela],os.path.join(outputPath,"%s.neg" % (rela)))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)