def main(treeModelPath, dataInputPath, resultOutPath, debug):
    """Run ``filterFiles`` over the JSON files of a directory in a worker pool.

    The tree model is loaded once with ``readModel`` and passed, together with
    the job number, file name and both directories, to ``filterFiles`` for
    every entry of ``dataInputPath`` whose name contains ".json".

    Args:
        treeModelPath: path handed to ``readModel``.
        dataInputPath: directory that is listed for input files.
        resultOutPath: output directory; created (including missing parents)
            when it does not exist yet.
        debug: when true, use a single worker process and submit only the
            first matching file.
    """
    # read model
    treeModel = readModel(treeModelPath)

    # create output dir; makedirs also creates missing parent directories
    if not os.path.isdir(resultOutPath):
        os.makedirs(resultOutPath)

    coreCount = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=1 if debug else coreCount)

    print("Number of core: %d" % coreCount)

    start_time = datetime.now()

    submitted = []
    jobN = 0
    for filename in os.listdir(dataInputPath):
        # NOTE(review): substring match, so "x.json.bak" is processed as well;
        # confirm whether endswith(".json") is what is really wanted.
        if ".json" not in filename:
            continue
        job = pool.apply_async(
            filterFiles,
            (jobN, filename, dataInputPath, resultOutPath, treeModel, debug))
        submitted.append((filename, job))
        if debug:
            # debug mode just tests 1 file in 1 process
            break
        jobN += 1

    pool.close()
    pool.join()

    # apply_async stores a worker's exception inside its AsyncResult; unless
    # get() is called a failed job would go completely unnoticed.
    for filename, job in submitted:
        try:
            job.get()
        except Exception as exc:
            print("Job for %s failed: %r" % (filename, exc))

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (1.005 s must not read "1.5000") and include
    # whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds,
                                     diff.microseconds))
# Example #2
# 0
def main(inputModel, inputPath, outputFileName):
    """Reduce multi-relation tree nodes to one relation and dump the tree.

    Relation counts start from ``buildProperties("../naive_model/PbR/")``.
    ``findAnwser`` is run in a process pool for every entry of ``inputPath``
    whose name contains ".json", and each returned mapping is added
    key-by-key into the counts. The tree read from ``inputModel`` is then
    rewritten in place and saved as JSON.

    Args:
        inputModel: path handed to ``readModel``.
        inputPath: directory that is listed for input files.
        outputFileName: path of the JSON file that receives the tree.
    """
    def trace(root):
        """Recursively collapse every "_rls_" list with several entries."""
        for terms in root:
            if terms == "_ptn_":
                continue
            if terms != "_rls_":
                # every other key is treated as a child node
                trace(root[terms])
                continue
            if len(root[terms]) <= 1:
                continue
            # keep the first relation with the strictly highest count
            m_c = 0
            m_r = ""
            for rela in root[terms]:
                if properties[rela] > m_c:
                    m_c = properties[rela]
                    m_r = rela
            # NOTE(review): when no candidate has a positive count the list
            # becomes [""] -- confirm that this is intended.
            root[terms] = [m_r]
            print("%s -> %s" % (root["_ptn_"], root[terms]))

    properties = buildProperties("../naive_model/PbR/")
    treeModel = readModel(inputModel)

    start_time = datetime.now()

    result = []
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(
                pool.apply_async(findAnwser,
                                 (t, filename, inputPath, partAns)))
            t += 1
    pool.close()
    pool.join()

    # get() also re-raises any exception that occurred inside a worker
    for res in result:
        r = res.get()
        for m in r:
            properties[m] += r[m]

    trace(treeModel)

    print("start write out to %s" % (outputFileName))
    # the with-block guarantees the output is flushed and closed
    with open(outputFileName, "w") as outFile:
        json.dump(treeModel, outFile)

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (1.005 s must not read "1.5000") and include
    # whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds,
                                     diff.microseconds))
def overlapDetect():
    """Print how the relation pattern files overlap inside the pattern tree.

    Every line of every file in ../naive_model/PbR is followed word by word
    through the tree read from ./patternTree.json. When at least one
    command-line argument is present, the overlap counts of each relation
    are printed; otherwise each pattern is printed with the relations whose
    files contain it, patterns shared by the most relations first.
    """
    treeModel = readModel("./patternTree.json")
    patternDir = "../naive_model/PbR"

    overlap = {}
    patterns = {}
    for filename in os.listdir(patternDir):
        # drops a 4-character suffix, presumably ".txt" -- TODO confirm
        relation = filename[:-4]
        counts = overlap[relation] = {}
        # "with" closes the file even if a word is missing from the tree
        with open(os.path.join(patternDir, filename), "r") as f:
            for line in f:
                # NOTE(review): the last two characters of every line are
                # cut off, presumably a two-character terminator -- confirm.
                words = line[:-2].lower().split()
                t = treeModel
                for word in words:
                    t = t[word]

                for rls in t["_rls_"]:
                    counts[rls] = counts.get(rls, 0) + 1

                owners = patterns.setdefault(t["_ptn_"], [])
                if relation not in owners:
                    owners.append(relation)

    if len(sys.argv) > 1:
        for relation in overlap:
            over = sorted(overlap[relation].items(),
                          key=lambda x: x[1],
                          reverse=True)
            print(relation)
            if not over:
                # an empty pattern file has no counts to report
                continue
            # the largest count is the reference for the first percentage
            total = over[0][1]
            for rela, count in over:
                p = 100 * float(count) / float(total)
                p2 = 100 * float(overlap[rela][relation]) / float(
                    overlap[rela][rela])
                if p > .0:
                    # same text as the former print statement, which emits
                    # no separator space directly after the tab
                    print("\t%s %s (%.2f%% / %.2f%% (%d))" % (
                        rela, count, p, p2, overlap[rela][rela]))
                else:
                    break
    else:
        sortedPattern = sorted(patterns.items(),
                               key=lambda x: len(x[1]),
                               reverse=True)
        for ptn, rls in sortedPattern:
            print("%s %s %s" % (ptn, len(rls), rls))
def overlapDetect():
    """Print how the relation pattern files overlap inside the pattern tree.

    Every line of every file in ../naive_model/PbR is followed word by word
    through the tree read from ./patternTree.json. When at least one
    command-line argument is present, the overlap counts of each relation
    are printed; otherwise each pattern is printed with the relations whose
    files contain it, patterns shared by the most relations first.
    """
    treeModel = readModel("./patternTree.json")
    patternDir = "../naive_model/PbR"

    overlap = {}
    patterns = {}
    for filename in os.listdir(patternDir):
        # drops a 4-character suffix, presumably ".txt" -- TODO confirm
        relation = filename[:-4]
        counts = overlap[relation] = {}
        # "with" closes the file even if a word is missing from the tree
        with open(os.path.join(patternDir, filename), "r") as f:
            for line in f:
                # NOTE(review): the last two characters of every line are
                # cut off, presumably a two-character terminator -- confirm.
                words = line[:-2].lower().split()
                t = treeModel
                for word in words:
                    t = t[word]

                for rls in t["_rls_"]:
                    counts[rls] = counts.get(rls, 0) + 1

                owners = patterns.setdefault(t["_ptn_"], [])
                if relation not in owners:
                    owners.append(relation)

    if len(sys.argv) > 1:
        for relation in overlap:
            over = sorted(overlap[relation].items(), key=lambda x: x[1], reverse=True)
            print(relation)
            if not over:
                # an empty pattern file has no counts to report
                continue
            # the largest count is the reference for the first percentage
            total = over[0][1]
            for rela, count in over:
                p = 100 * float(count) / float(total)
                p2 = 100 * float(overlap[rela][relation]) / float(overlap[rela][rela])
                if p > .0:
                    # same text as the former print statement, which emits
                    # no separator space directly after the tab
                    print("\t%s %s (%.2f%% / %.2f%% (%d))" % (rela, count, p, p2, overlap[rela][rela]))
                else:
                    break
    else:
        sortedPattern = sorted(patterns.items(), key=lambda x: len(x[1]), reverse=True)
        for ptn, rls in sortedPattern:
            print("%s %s %s" % (ptn, len(rls), rls))
def buildOverlapMatrix():
    """Print pairs of relations whose overlap equals their own pattern count.

    For every "<name>.txt" file in ../naive_model/PbR/ each line is followed
    word by word through the tree read from ./patternTree.json and the
    relations listed in the reached node's "_rls_" are counted. A pair is
    printed as "A <-> B" when A's count for B equals A's count for itself
    and the same holds in the other direction; each such pair is therefore
    printed once per direction.
    """
    treeModel = readModel("./patternTree.json")

    # Only "<name>.txt" entries can be reopened below as "%s.txt"; any other
    # directory entry used to end in a failing open().
    keys = [filename[:-4]
            for filename in os.listdir("../naive_model/PbR/")
            if filename.endswith(".txt")]

    # overlapMatrix[relaA][relaB]: number of lines of relaA's file whose tree
    # node also lists relaB
    overlapMatrix = {}
    for relation in keys:
        overlapMatrix[relation] = dict.fromkeys(keys, 0)

    # read tree
    for ontology in overlapMatrix:
        # "with" closes the file even if a word is missing from the tree
        with open("../naive_model/PbR/%s.txt" % (ontology), "r") as f:
            for pattern in f:
                # NOTE(review): the last two characters of every line are
                # cut off, presumably a two-character terminator -- confirm.
                words = pattern[:-2].lower().split()
                t = treeModel
                for word in words:
                    t = t[word]

                for relaA in t["_rls_"]:
                    overlapMatrix[ontology][relaA] += 1

    # over[A]: the relations that are counted as often as A itself in A's row
    over = {}
    for ontology in overlapMatrix:
        row = overlapMatrix[ontology]
        thisCount = row[ontology]
        matching = [otherOntology for otherOntology in row
                    if otherOntology != ontology
                    and row[otherOntology] == thisCount]
        if matching:
            over[ontology] = matching

    for ontology in over:
        for otherOntology in over[ontology]:
            if otherOntology in over and ontology in over[otherOntology]:
                print("%s <-> %s" % (ontology, otherOntology))
def main(inputModel, inputPath, outputFileName):
    """Reduce multi-relation tree nodes to one relation and dump the tree.

    Relation counts start from ``buildProperties("../naive_model/PbR/")``.
    ``findAnwser`` is run in a process pool for every entry of ``inputPath``
    whose name contains ".json", and each returned mapping is added
    key-by-key into the counts. The tree read from ``inputModel`` is then
    rewritten in place and saved as JSON.

    Args:
        inputModel: path handed to ``readModel``.
        inputPath: directory that is listed for input files.
        outputFileName: path of the JSON file that receives the tree.
    """
    def trace(root):
        """Recursively collapse every "_rls_" list with several entries."""
        for terms in root:
            if terms == "_ptn_":
                continue
            if terms != "_rls_":
                # every other key is treated as a child node
                trace(root[terms])
                continue
            if len(root[terms]) <= 1:
                continue
            # keep the first relation with the strictly highest count
            m_c = 0
            m_r = ""
            for rela in root[terms]:
                if properties[rela] > m_c:
                    m_c = properties[rela]
                    m_r = rela
            # NOTE(review): when no candidate has a positive count the list
            # becomes [""] -- confirm that this is intended.
            root[terms] = [m_r]
            print("%s -> %s" % (root["_ptn_"], root[terms]))

    properties = buildProperties("../naive_model/PbR/")
    treeModel = readModel(inputModel)

    start_time = datetime.now()

    result = []
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(findAnwser, (t, filename, inputPath, partAns)))
            t += 1
    pool.close()
    pool.join()

    # get() also re-raises any exception that occurred inside a worker
    for res in result:
        r = res.get()
        for m in r:
            properties[m] += r[m]

    trace(treeModel)

    print("start write out to %s" % (outputFileName))
    # the with-block guarantees the output is flushed and closed
    with open(outputFileName, "w") as outFile:
        json.dump(treeModel, outFile)

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (1.005 s must not read "1.5000") and include
    # whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
# Example #7
# 0
def buildOverlapMatrix():
    """Print pairs of relations whose overlap equals their own pattern count.

    For every "<name>.txt" file in ../naive_model/PbR/ each line is followed
    word by word through the tree read from ./patternTree.json and the
    relations listed in the reached node's "_rls_" are counted. A pair is
    printed as "A <-> B" when A's count for B equals A's count for itself
    and the same holds in the other direction; each such pair is therefore
    printed once per direction.
    """
    treeModel = readModel("./patternTree.json")

    # Only "<name>.txt" entries can be reopened below as "%s.txt"; any other
    # directory entry used to end in a failing open().
    keys = [filename[:-4]
            for filename in os.listdir("../naive_model/PbR/")
            if filename.endswith(".txt")]

    # overlapMatrix[relaA][relaB]: number of lines of relaA's file whose tree
    # node also lists relaB
    overlapMatrix = {}
    for relation in keys:
        overlapMatrix[relation] = dict.fromkeys(keys, 0)

    # read tree
    for ontology in overlapMatrix:
        # "with" closes the file even if a word is missing from the tree
        with open("../naive_model/PbR/%s.txt" % (ontology), "r") as f:
            for pattern in f:
                # NOTE(review): the last two characters of every line are
                # cut off, presumably a two-character terminator -- confirm.
                words = pattern[:-2].lower().split()
                t = treeModel
                for word in words:
                    t = t[word]

                for relaA in t["_rls_"]:
                    overlapMatrix[ontology][relaA] += 1

    # over[A]: the relations that are counted as often as A itself in A's row
    over = {}
    for ontology in overlapMatrix:
        row = overlapMatrix[ontology]
        thisCount = row[ontology]
        matching = [otherOntology for otherOntology in row
                    if otherOntology != ontology
                    and row[otherOntology] == thisCount]
        if matching:
            over[ontology] = matching

    for ontology in over:
        for otherOntology in over[ontology]:
            if otherOntology in over and ontology in over[otherOntology]:
                print("%s <-> %s" % (ontology, otherOntology))
def main(treeModelPath, dataInputPath, resultOutPath, debug):
    """Run ``filterFiles`` over the JSON files of a directory in a worker pool.

    The tree model is loaded once with ``readModel`` and passed, together with
    the job number, file name and both directories, to ``filterFiles`` for
    every entry of ``dataInputPath`` whose name contains ".json".

    Args:
        treeModelPath: path handed to ``readModel``.
        dataInputPath: directory that is listed for input files.
        resultOutPath: output directory; created (including missing parents)
            when it does not exist yet.
        debug: when true, use a single worker process and submit only the
            first matching file.
    """
    # read model
    treeModel = readModel(treeModelPath)

    # create output dir; makedirs also creates missing parent directories
    if not os.path.isdir(resultOutPath):
        os.makedirs(resultOutPath)

    coreCount = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=1 if debug else coreCount)

    print("Number of core: %d" % coreCount)

    start_time = datetime.now()

    submitted = []
    jobN = 0
    for filename in os.listdir(dataInputPath):
        # NOTE(review): substring match, so "x.json.bak" is processed as well;
        # confirm whether endswith(".json") is what is really wanted.
        if ".json" not in filename:
            continue
        job = pool.apply_async(filterFiles,
                               (jobN, filename, dataInputPath, resultOutPath,
                                treeModel, debug))
        submitted.append((filename, job))
        if debug:
            # debug mode just tests 1 file in 1 process
            break
        jobN += 1

    pool.close()
    pool.join()

    # apply_async stores a worker's exception inside its AsyncResult; unless
    # get() is called a failed job would go completely unnoticed.
    for filename, job in submitted:
        try:
            job.get()
        except Exception as exc:
            print("Job for %s failed: %r" % (filename, exc))

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (1.005 s must not read "1.5000") and include
    # whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds,
                                     diff.microseconds))