def main(dataInputPath, resultOutPath, ptnOutputPath):
    """Dispatch ``tryToFindRela`` over the JSON files of a directory in parallel.

    The prefix-tree model and its pattern table are loaded once, one
    asynchronous job is queued per matching file on a process pool sized to
    the machine's core count, and after all jobs finish the model and table
    are written under ``ptnOutputPath`` as "model" and "table".

    Args:
        dataInputPath: Directory that is listed for input files.
        resultOutPath: Output directory, created if missing; passed to every job.
        ptnOutputPath: Output directory, created if missing; passed to every
            job and used for the final "model"/"table" files.
    """
    model, table = projizz.readPrefixTreeModel(
        "../prefix_tree_model/patternTree.json")

    # os.mkdir only creates the leaf directory; parents must already exist.
    for outputDir in (resultOutPath, ptnOutputPath):
        if not os.path.isdir(outputDir):
            os.mkdir(outputDir)

    coreCount = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=coreCount)
    print("Number of core: %d" % (coreCount))

    start_time = datetime.now()
    pending = []  # (job id, file name, AsyncResult) for every queued job
    jobN = 0
    for filename in os.listdir(dataInputPath):
        # Substring match: names such as "a.json.bak" are dispatched as well.
        if ".json" in filename:
            handle = pool.apply_async(
                tryToFindRela,
                (jobN, filename, dataInputPath, resultOutPath,
                 ptnOutputPath, model, table))
            pending.append((jobN, filename, handle))
            jobN += 1
    pool.close()
    pool.join()
    diff = datetime.now() - start_time

    # Microseconds must be zero-padded to six digits to form a valid decimal
    # fraction, and timedelta.seconds alone drops whole days.
    wholeSeconds = diff.days * 86400 + diff.seconds
    print("Spend %d.%06d seconds" % (wholeSeconds, diff.microseconds))

    # apply_async keeps a worker's exception inside its AsyncResult; without
    # get() it would be lost. Report failures but keep going (best effort).
    for jobId, jobFile, handle in pending:
        try:
            handle.get()
        except Exception as error:
            print("Job %d (%s) failed: %r" % (jobId, jobFile, error))

    projizz.combinedFileWriter(model, os.path.join(ptnOutputPath, "model"))
    projizz.combinedFileWriter(table, os.path.join(ptnOutputPath, "table"))
def testing(filename):
    """Print the patterns matched in each sentence of the articles in a file.

    Reads ``filename`` with ``projizz.combinedFileReader``, loads the
    prefix-tree model, and for every article key prints the key, then each
    sentence that yields at least one extracted pattern followed by the
    matched pattern ids and their pattern text. The elapsed time is printed
    at the end.

    NOTE(review): a second ``testing`` definition appears later in this file;
    if both are at module level, the later one is the one that stays bound.

    Args:
        filename: Path handed to ``projizz.combinedFileReader``.
    """
    content = projizz.combinedFileReader(filename)
    model, table = projizz.readPrefixTreeModel(
        "./../prefix_tree_model/patternTree.json")

    start_time = datetime.now()
    for articleName in content:
        print(articleName)
        article = projizz.articleSimpleSentenceFileter(content[articleName])
        for line in article:
            tokens = projizz._posTagger.tag(line)
            patternExtracted = projizz.naiveExtractPatterns(tokens, model)
            if not patternExtracted:
                continue
            print(line.encode("utf-8"))
            # Only the pattern id is reported; the span bounds are unused.
            for ptnId, _start, _end in patternExtracted:
                print("\t[%d] %s" % (ptnId, table[ptnId]["pattern"]))
        # NOTE(review): separator assumed to close each article -- confirm.
        print("\n----")
    diff = datetime.now() - start_time

    # Microseconds must be zero-padded to six digits to form a valid decimal
    # fraction, and timedelta.seconds alone drops whole days.
    wholeSeconds = diff.days * 86400 + diff.seconds
    print("Spend %d.%06d seconds" % (wholeSeconds, diff.microseconds))
def testing(filename):
    """Walk the articles in ``filename`` and print every extracted pattern.

    The file is loaded through ``projizz.combinedFileReader`` and the
    prefix-tree model through ``projizz.readPrefixTreeModel``. For each
    article key the key is printed; each sentence is POS-tagged and run
    through ``projizz.naiveExtractPatterns``; sentences with at least one
    match are printed together with the matched ids and pattern text.
    The total elapsed time is printed last.

    NOTE(review): an earlier ``testing`` definition with the same logic
    appears in this file; consider removing one of the two.

    Args:
        filename: Path handed to ``projizz.combinedFileReader``.
    """
    content = projizz.combinedFileReader(filename)
    model, table = projizz.readPrefixTreeModel(
        "./../prefix_tree_model/patternTree.json")

    start_time = datetime.now()
    for articleName in content:
        print(articleName)
        sentences = projizz.articleSimpleSentenceFileter(content[articleName])
        for line in sentences:
            tokens = projizz._posTagger.tag(line)
            patternExtracted = projizz.naiveExtractPatterns(tokens, model)
            if not patternExtracted:
                continue
            print(line.encode("utf-8"))
            # The start/end positions of a match are not used in the report.
            for ptnId, _start, _end in patternExtracted:
                print("\t[%d] %s" % (ptnId, table[ptnId]["pattern"]))
        # NOTE(review): separator assumed to close each article -- confirm.
        print("\n----")
    diff = datetime.now() - start_time

    # Zero-pad microseconds so 1 s + 5000 us reads "1.005000", not "1.5000",
    # and fold whole days into the seconds figure.
    wholeSeconds = diff.days * 86400 + diff.seconds
    print("Spend %d.%06d seconds" % (wholeSeconds, diff.microseconds))