Example #1
import os
import multiprocessing
from datetime import datetime

import projizz


def main(dataInputPath, resultOutPath, ptnOutputPath):

    # Load the shared prefix-tree pattern model and its pattern lookup table.
    model, table = projizz.readPrefixTreeModel("../prefix_tree_model/patternTree.json")

    if not os.path.isdir(resultOutPath):
        os.mkdir(resultOutPath)

    if not os.path.isdir(ptnOutputPath):
        os.mkdir(ptnOutputPath)

    # One worker process per CPU core.
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    print "Number of cores: %d" % (multiprocessing.cpu_count())
    start_time = datetime.now()
    
    # Dispatch every .json article file to the pool; tryToFindRela is the
    # per-file worker function (not shown in this example).
    jobN = 0
    for filename in os.listdir(dataInputPath):
        if ".json" in filename:
            pool.apply_async(tryToFindRela, (jobN, filename, dataInputPath, resultOutPath, ptnOutputPath, model, table))
            jobN += 1

    # No more jobs to submit; wait for every worker to finish.
    pool.close()
    pool.join()

    diff = datetime.now() - start_time
    # Zero-pad the microseconds so they read as a fraction of a second.
    print "Spent %d.%06d seconds" % (diff.seconds, diff.microseconds)

    # Write the pattern model and lookup table next to the extracted patterns.
    projizz.combinedFileWriter(model, os.path.join(ptnOutputPath, "model"))
    projizz.combinedFileWriter(table, os.path.join(ptnOutputPath, "table"))
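
# Hypothetical driver sketch: a minimal way main() could be invoked, assuming
# the script is run with three directory paths on the command line. The
# argument handling below is illustrative only, not part of the original code.
if __name__ == "__main__":
    import sys
    if len(sys.argv) != 4:
        print "usage: %s <dataInputPath> <resultOutPath> <ptnOutputPath>" % sys.argv[0]
        sys.exit(1)
    main(sys.argv[1], sys.argv[2], sys.argv[3])

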
def testing(filename):
    # Load one combined-article file and report every prefix-tree pattern hit.
    content = projizz.combinedFileReader(filename)

    model, table = projizz.readPrefixTreeModel("./../prefix_tree_model/patternTree.json")
  
    start_time = datetime.now()
    for articleName in content:
        print articleName
        # Break the article into simple sentences; the misspelled function name
        # is projizz's own.
        article = projizz.articleSimpleSentenceFileter(content[articleName])
        
        for line in article:
            # POS-tag the sentence and match it against the prefix-tree patterns.
            tokens = projizz._posTagger.tag(line)
            patternExtracted = projizz.naiveExtractPatterns(tokens, model)
            if len(patternExtracted) > 0:
                print line.encode("utf-8")
                for ptnId, start, to in patternExtracted:
                    print "\t[%d] %s" % (ptnId, table[ptnId]["pattern"])

        print "\n----"
    diff = datetime.now() - start_time
    print "Spent %d.%06d seconds" % (diff.seconds, diff.microseconds)