import tornado.ioloop

import mapreduce.framework as framework
from src import color
# Assumed import paths: DisTable and Trainer are used below, but the
# original file's import block was not captured with this snippet.
from mapreduce.distable import DisTable
from classification import trainer


def main():
    C = color.bcolors()

    print C.HEADER + "=========== Instantiate MapReduceFramework ===========" + C.ENDC
    mrf = framework.MapReduceFramework()
    mrf.getWorkerInfo('prework_workers.json')

    # print C.HEADER + "=========== Start Local Indexing ===========" + C.ENDC
    # localIndexer.ReviewIndexing()
    # mrf.mapReduceFS('tintest', 'mapreduce/test/fish_jobs', 'mapreduce.test.wordcount.mapper', 1,
    #                 'mapreduce.test.wordcount.reducer', 'mapreduce/test/fish_jobs/out')
    # tornado.ioloop.IOLoop.instance().start()

    print C.HEADER + "=========== Start Indexing Movies ===========" + C.ENDC

    print C.OKBLUE + "Start idfBuilder_test" + C.ENDC
    mrf.mapReduceFS('idfBuilder_test', 'mapreduce/input_movie_test', 'src.idfBuilder.mapper', 1,
                    'src.idfBuilder.reducer', 'constants/idf')
    tornado.ioloop.IOLoop.instance().start()
    jobTable = DisTable(tableName='idfBuilder_test')
    print type(jobTable)
    print jobTable.fetch_all()
    print 'works!!'

    print C.OKBLUE + "Start idfBuilder" + C.ENDC
    mrf.mapReduceFS('idfBuilder', 'constants/input_movie_test', 'src.idfBuilder.mapper', 1,
                    'src.idfBuilder.reducer', 'constants/idf')
    tornado.ioloop.IOLoop.instance().start()

    print C.OKBLUE + "Start invertedIndexer" + C.ENDC
    mrf.mapReduceFS('invertedIndexer', 'constants/input_movie', 'src.invertedIndexer.mapper', 3,
                    'src.invertedIndexer.reducer', 'constants/invertedIndex')
    tornado.ioloop.IOLoop.instance().start()

    print C.OKBLUE + "Start documentStore" + C.ENDC
    mrf.mapReduceFS('documentStore', 'constants/input_movie', 'src.documentStore.mapper', 3,
                    'src.documentStore.reducer', 'constants/documentStore')
    tornado.ioloop.IOLoop.instance().start()

    print C.HEADER + "=========== Start Indexing Genre ===========" + C.ENDC
    print C.OKBLUE + "Start genreIndexer" + C.ENDC
    mrf.mapReduceFS('genreIndexer', 'constants/input_movie', 'src.genreIndexer.mapper', 1,
                    'src.genreIndexer.reducer', 'constants/genreIndexer')
    tornado.ioloop.IOLoop.instance().start()

    print C.HEADER + "=========== Start Indexing Reviews ===========" + C.ENDC
    print C.OKBLUE + "Start movieIndexer" + C.ENDC
    mrf.mapReduceFS('movieIndexer', 'constants/input_review', 'src.movieIndexer.mapper', 3,
                    'src.movieIndexer.reducer', 'constants/movieIndexer')
    tornado.ioloop.IOLoop.instance().start()

    print C.OKBLUE + "Start reviewIndexer" + C.ENDC
    mrf.mapReduceFS('reviewIndexer', 'constants/input_review', 'src.reviewIndexer.mapper', 3,
                    'src.reviewIndexer.reducer', 'constants/reviewIndexer')
    tornado.ioloop.IOLoop.instance().start()

    print C.HEADER + "=========== Start Classification Training ===========" + C.ENDC
    worker_address = 'prework_workers.json'
    # raw_data = 'constants/Genre_dict'
    raw_data = 'constants/Genre_dictII_9500'
    training_set = 'constants/training_set.p'
    weights_dir = 'constants/classification_weights'

    tn = trainer.Trainer()
    tn.setWorkerInfo(worker_address)
    genres = tn.processRawData(raw_data, training_set)
    tn.setTraningParameter(0.9, 500, 0.01)
    tn.train(training_set, genres, weights_dir)
    tornado.ioloop.IOLoop.instance().start()
    tn.generateWeightTable(weights_dir)
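# Illustration: the jobs above load mappers and reducers by dotted module
# path (e.g. 'src.idfBuilder.mapper'). The framework's exact calling
# convention is not shown in this file; the sketch below assumes the
# common convention that a mapper yields (key, value) pairs per input
# record and a reducer folds all values sharing a key. The names and
# signatures here are illustrative assumptions, not the framework's
# confirmed API.

def mapper(doc_id, text):
    # Emit each distinct term once per document, so the reducer counts
    # document frequency rather than raw term frequency.
    for term in set(text.lower().split()):
        yield term, 1

def reducer(term, counts):
    # Document frequency: the number of documents containing the term.
    yield term, sum(counts)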
import tornado.ioloop

import mapreduce.framework as framework
from src import color
# Assumed import path: Trainer is used below, but the original file's
# import block was not captured with this snippet.
from classification import trainer


def main():
    C = color.bcolors()

    print C.HEADER + "=========== Instantiate MapReduceFramework ===========" + C.ENDC
    mrf = framework.MapReduceFramework()
    mrf.getWorkerInfo('mapreduce_workers.json')

    # print C.HEADER + "=========== Start Local Indexing ===========" + C.ENDC
    # localIndexer.ReviewIndexing()

    print C.HEADER + "=========== Start Indexing Movies ===========" + C.ENDC

    print C.OKBLUE + "Start invertedIndexer" + C.ENDC
    mrf.mapReduce('constants/input_movie', 'src.invertedIndexer.mapper', 3,
                  'src.invertedIndexer.reducer', 'constants/invertedIndex')
    tornado.ioloop.IOLoop.instance().start()

    print C.OKBLUE + "Start idfBuilder" + C.ENDC
    mrf.mapReduce('constants/input_movie', 'src.idfBuilder.mapper', 1,
                  'src.idfBuilder.reducer', 'constants/idf')
    tornado.ioloop.IOLoop.instance().start()

    print C.OKBLUE + "Start documentStore" + C.ENDC
    mrf.mapReduce('constants/input_movie', 'src.documentStore.mapper', 3,
                  'src.documentStore.reducer', 'constants/documentStore')
    tornado.ioloop.IOLoop.instance().start()

    print C.HEADER + "=========== Start Indexing Genre ===========" + C.ENDC
    print C.OKBLUE + "Start genreIndexer" + C.ENDC
    mrf.mapReduce('constants/input_movie', 'src.genreIndexer.mapper', 1,
                  'src.genreIndexer.reducer', 'constants/genreIndexer')
    tornado.ioloop.IOLoop.instance().start()

    print C.HEADER + "=========== Start Indexing Reviews ===========" + C.ENDC
    print C.OKBLUE + "Start movieIndexer" + C.ENDC
    mrf.mapReduce('constants/input_review', 'src.movieIndexer.mapper', 3,
                  'src.movieIndexer.reducer', 'constants/movieIndexer')
    tornado.ioloop.IOLoop.instance().start()

    print C.OKBLUE + "Start reviewIndexer" + C.ENDC
    mrf.mapReduce('constants/input_review', 'src.reviewIndexer.mapper', 3,
                  'src.reviewIndexer.reducer', 'constants/reviewIndexer')
    tornado.ioloop.IOLoop.instance().start()

    print C.HEADER + "=========== Start Classification Training ===========" + C.ENDC
    worker_address = 'classification_workers.json'
    # raw_data = 'constants/Genre_dict'  # superseded by the line below
    raw_data = 'constants/Genre_dictII_9500'
    training_set = 'constants/training_set.p'
    weights_dir = 'constants/classification_weights'

    tn = trainer.Trainer()
    tn.setWorkerInfo(worker_address)
    genres = tn.processRawData(raw_data, training_set)
    tn.setTraningParameter(0.9, 500, 0.01)
    tn.train(training_set, genres, weights_dir)
    tornado.ioloop.IOLoop.instance().start()
    tn.generateWeightTable(weights_dir)
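# Illustration: 'constants/idf' holds inverse document frequencies. The
# standard formula is idf(t) = log(N / df(t)), with N the corpus size and
# df(t) the number of documents containing term t. A self-contained
# sketch (the variable names are ours, not the pipeline's):
import math

def idf_table(doc_freqs, num_docs):
    # doc_freqs maps each term to the number of documents containing it.
    return dict((term, math.log(float(num_docs) / df))
                for term, df in doc_freqs.items())

# idf_table({'fish': 2, 'the': 10}, 10)
# -> {'the': 0.0, 'fish': 1.6094...}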
import urllib

from nltk.tokenize import RegexpTokenizer
from tornado.httpclient import AsyncHTTPClient
from tornado import gen
from tornado.options import define, options
from operator import mul

from src import color

invertedIndex = None
tokenizer = None
IDF_Index = None
bcolors = color.bcolors()


def Tosnippet(text, keywords, extend):
    returnText = '...'
    for keyword in keywords:
        # Highlight the first case-insensitive occurrence of the keyword.
        loc = text.lower().find(keyword.lower(), 0)
        if loc == -1:
            continue  # guard added: keyword not present, skip it
        toReplace = text[loc:loc + len(keyword)]
        text = text.replace(toReplace, '<strong>{}</strong>'.format(toReplace)) + "..."

        # Walk the remaining occurrences, remembering the last match position.
        loc = 0
        tmp = 0
        while loc < len(text) and loc != -1:
            tmp = loc
            loc = text.lower().find(keyword.lower(), loc)
            if loc == -1:
                break
            # Assumed completion: the rest of the original function is not
            # shown; advancing past the match keeps the scan from spinning.
            loc += len(keyword)
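# Illustration: a self-contained version of the <strong>-wrapping step
# used in Tosnippet above (the function name here is ours, for
# illustration only):
def highlight_first(text, keyword):
    # Find the first case-insensitive match and wrap it in <strong> tags.
    loc = text.lower().find(keyword.lower())
    if loc == -1:
        return text
    match = text[loc:loc + len(keyword)]
    return text.replace(match, '<strong>{}</strong>'.format(match))

# highlight_first('The Quick Fox', 'quick')
# -> 'The <strong>Quick</strong> Fox'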
import tornado.httpserver
import tornado.ioloop
from tornado.netutil import bind_sockets
from tornado.process import fork_processes


def main():
    from recommendation import recom_worker, recom_front
    from searchEngine.backend import back as searchEng_worker
    from searchEngine.frontend import front as searchEng_front
    from classification.backend import online as classifier
    # from recommendation import searchEng_worker, searchEng_front
    import mapreduce.framework as framework
    from src import color
    # from src import tomatoCrawler as TC

    C = color.bcolors()
    global masterServer, MovieServer, ReviewServer, IdxServer, DocServer, Baseport

    print C.HEADER + "=========== Start Crawling ===========" + C.ENDC
    # TC.main2Genre()

    print C.HEADER + "=========== Find Available Ports ===========" + C.ENDC
    getPorts()
    print C.OKBLUE + "SuperFront:\t" + str(SuperFront) + C.ENDC
    print C.OKBLUE + "masterServer:\t" + str(masterServer) + C.ENDC
    print C.OKBLUE + "MovieServer:\t" + str(MovieServer) + C.ENDC
    print C.OKBLUE + "ReviewServer:\t" + str(ReviewServer) + C.ENDC
    print C.OKBLUE + "IdxServer:\t" + str(IdxServer) + C.ENDC
    print C.OKBLUE + "DocServer:\t" + str(DocServer) + C.ENDC
    print C.OKBLUE + "ClassifierServer:\t" + str(ClassifierServer) + C.ENDC

    print C.HEADER + "=========== Fire Up All Servers ===========" + C.ENDC
    uid = fork_processes(NumMaster + NumMovie + NumReview + NumIdx + NumDoc)
    if uid == 0:
        # Recommendation front end.
        sockets = bind_sockets(masterServer[uid].split(':')[-1])
        myfront = recom_front.FrontEndApp(MovieServer, ReviewServer)
        server = myfront.app
    elif uid == 1:
        # Search-engine front end.
        sockets = bind_sockets(masterServer[uid].split(':')[-1])
        myfront = searchEng_front.FrontEndApp(IdxServer, DocServer)
        server = myfront.app
    elif uid == 2:
        # Genre classifier.
        sockets = bind_sockets(masterServer[uid].split(':')[-1])
        myClasify = classifier.Application([(r"/predict?", classifier.PredictionHandler)])
        myClasify.setGenres("./constants/classification_weights/genres.p")
        myClasify.setWeights("./constants/classification_weights/big_weight.p")
        server = tornado.httpserver.HTTPServer(myClasify)
    elif uid < NumMaster + NumMovie:
        # Recommendation movie workers.
        myIdx = uid - NumMaster
        sockets = bind_sockets(MovieServer[myIdx].split(':')[-1])
        myback_movie = recom_worker.RecommApp('MovieServer', myIdx, MovieServer[myIdx].split(':')[-1])
        server = myback_movie.app
    elif uid < NumMaster + NumMovie + NumReview:
        # Recommendation review workers.
        myIdx = uid - NumMovie - NumMaster
        sockets = bind_sockets(ReviewServer[myIdx].split(':')[-1])
        myback_review = recom_worker.RecommApp('ReviewServer', myIdx, ReviewServer[myIdx].split(':')[-1])
        server = myback_review.app
    elif uid < NumMaster + NumMovie + NumReview + NumIdx:
        # Search-engine index workers.
        myIdx = uid - NumMovie - NumReview - NumMaster
        sockets = bind_sockets(IdxServer[myIdx].split(':')[-1])
        myback_idx = searchEng_worker.BackEndApp('IndexServer', myIdx, IdxServer[myIdx].split(':')[-1])
        server = myback_idx.app
    elif uid < NumMaster + NumMovie + NumReview + NumIdx + NumDoc:
        # Search-engine document workers.
        myIdx = uid - NumMovie - NumReview - NumIdx - NumMaster
        sockets = bind_sockets(DocServer[myIdx].split(':')[-1])
        myback_doc = searchEng_worker.BackEndApp('DocServer', myIdx, DocServer[myIdx].split(':')[-1])
        server = myback_doc.app

    server.add_sockets(sockets)
    tornado.ioloop.IOLoop.instance().start()
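# Illustration: getPorts() is called above but defined elsewhere. One
# common way to find a free port is to bind to port 0 and let the OS
# pick; this sketch is an assumption about the approach, not the
# project's actual implementation:
import socket

def find_free_port():
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(('', 0))  # port 0 -> the OS assigns any free port
    port = s.getsockname()[1]
    s.close()
    return port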
import tornado.httpserver
import tornado.ioloop
import tornado.web
import hashlib
import socket
import getpass
import json, pickle

from tornado.httpclient import AsyncHTTPClient
from tornado import gen
from tornado.options import define, options

from src import color

bcolors = color.bcolors()

# Collect the available ports for each server group.
ports = []
ports_index = []
ports_Doc = []
genere_dict = {}


def remove_duplicates(mylist):
    output = []
    seen = set()
    for (movieID, value) in mylist:
        # If this movieID has not been encountered yet,
        # add the pair to the output and mark the ID as seen.
        if movieID not in seen:
            output.append((movieID, value))
            seen.add(movieID)
    return output
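if __name__ == '__main__':
    # Illustrative sanity check for remove_duplicates (not part of the
    # original file): the second pair with movieID 1 is dropped, keeping
    # the first occurrence.
    print remove_duplicates([(1, 'a'), (2, 'b'), (1, 'c')])
    # -> [(1, 'a'), (2, 'b')]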