import job_processing_functions as jpf

# One-shot job runner: reads the ini configuration, connects to the MySQL
# database and, when an article id and a revision id are given on the command
# line, processes that single (article_id, revision_id) job.
#
# Usage: <script> ARTICLE_ID REVISION_ID

## const globals
BASE_DIR = "/home/michael/science/wikipedia/code/"
INI_FILE = BASE_DIR + "config_data.ini"

## parse the ini file
ini_config = ConfigParser.ConfigParser()
# close the ini file handle as soon as it has been parsed
with open(INI_FILE) as ini_fp:
    ini_config.readfp(ini_fp)

## init constants
N = int(ini_config.get('constants', 'N'))
K_REV = int(ini_config.get('constants', 'K_REVISIONS'))
K_TIME = int(ini_config.get('constants', 'K_TIME'))

## init the DB
connection = MySQLdb.connect(
    host=ini_config.get('db', 'host'),
    user=ini_config.get('db', 'user'),
    passwd=ini_config.get('db', 'passwd'),
    db=ini_config.get('db', 'db'),
)
try:
    curs = connection.cursor()
    try:
        # both ids are required; with fewer arguments nothing is processed
        if len(sys.argv) > 2:
            # ids are passed on to jpf exactly as given on the command line
            # (strings, no conversion)
            article_id = sys.argv[1]
            revision_id = sys.argv[2]
            job = (article_id, revision_id)
            jsn = jpf.get_treejson(curs, article_id)
            jpf.process_a_job(curs, job, jsn, N, K_REV, K_TIME)
    finally:
        # release the cursor even if job processing raised
        curs.close()
finally:
    # release the connection even if job processing raised
    connection.close()
def run(self):
    """Poll the job queue forever and apply queued revisions to the trees.

    Reads the ini file at ``self.inifile`` for the processing constants
    (``N``, ``K_REVISIONS``, ``K_TIME``, ``SLEEP_TIME``) and the database
    credentials, opens a MySQL connection with autocommit enabled, then
    loops without a normal exit.  The cursor and the connection are closed
    when the loop is left through an exception.
    """
    # parse the ini file; the handle is closed as soon as it has been read
    ini_config = ConfigParser.ConfigParser()
    with open(self.inifile) as ini_fp:
        ini_config.readfp(ini_fp)

    # init constants
    N = int(ini_config.get("constants", "N"))
    K_REV = int(ini_config.get("constants", "K_REVISIONS"))
    K_TIME = int(ini_config.get("constants", "K_TIME"))
    SLEEP_TIME = int(ini_config.get("constants", "SLEEP_TIME"))

    ## init the DB
    connection = MySQLdb.connect(
        host=ini_config.get("db", "host"),
        user=ini_config.get("db", "user"),
        passwd=ini_config.get("db", "passwd"),
        db=ini_config.get("db", "db"),
    )
    try:
        connection.autocommit(True)
        curs = connection.cursor()
        try:
            # logic description
            # get_all_jobs returns a dictionary of jobs keyed by
            # article id (page id); the value is a list of revision ids;
            # the list is sorted in ascending order;
            # then we iterate over all articles and add revisions in
            # ascending order;
            # before adding a revision we check whether
            # the previous revision was added to the tree or not;
            # if the previous revision was not added then we queue all
            # revision ids of the article which have not been processed;
            # if the previous revision was added then we add
            # the current revision to the tree and delete the job from
            # the queue
            while True:
                jobs = jpf.get_all_jobs(curs)
                # if there are no jobs (None or an empty dictionary) then
                # we sleep for SLEEP_TIME seconds instead of busy-polling
                if not jobs:
                    time.sleep(SLEEP_TIME)
                    continue

                # now processing all jobs
                for article_id, list_of_rev_id in jobs.items():
                    for idx, revision_id in enumerate(list_of_rev_id):
                        job = (article_id, revision_id)
                        # re-read the tree each time: the previous
                        # iteration may have just stored or updated it
                        jsn = jpf.get_treejson(curs, article_id)

                        # case when we don't have a stored tree yet
                        if jsn is None:
                            # if the revision is the first revision
                            # NOTE(review): a missing tree with a non-first
                            # revision falls through to the up-to-date
                            # check below with jsn=None -- confirm that the
                            # jpf helpers accept that.
                            if jpf.get_previous_revision_id(
                                    curs, revision_id) == 0:
                                jpf.process_a_job(
                                    curs, job, jsn, N, K_REV, K_TIME)
                                continue

                        # todo(michael): think about the case when we want
                        # to add a revision which should have been added
                        # earlier than the latest added revision
                        if jpf.is_tree_uptodate(
                                curs, jsn, article_id, revision_id):
                            jpf.process_a_job(curs, job, jsn, N, K_REV, K_TIME)
                            continue

                        # fetch all unprocessed revisions
                        # and add them to the queue
                        new_ids = jpf.fetch_new_revisions(
                            curs, jsn, article_id)
                        # revisions from the current one onwards are
                        # already queued; build the lookup once instead of
                        # re-slicing the list for every candidate id
                        already_queued = set(list_of_rev_id[idx:])
                        missed_ids = [x for x in new_ids
                                      if x not in already_queued]
                        for missed_revision_id in missed_ids:
                            jpf.put_job_in_queue(
                                curs, article_id, missed_revision_id)
                        # inserted jobs will be processed when
                        # job_processor reads the job queue next time;
                        # now we need to process the next article
                        break
        finally:
            # the loop only ends via an exception; release the cursor
            curs.close()
    finally:
        connection.close()