def publish(records, url=psettings.RABBITMQ_URL,
            exchange='MergerPipelineExchange',
            routing_key='FindNewRecordsRoute'):
    # It's OK that we create/tear down this connection many times within this
    # script; it is not a bottleneck and likely slightly increases stability
    # of the workflow.
    w = RabbitMQWorker()
    w.connect(url)
    w.channel.basic_publish(exchange, routing_key, json.dumps(records))
    w.connection.close()
def publish(bibcodes, url=psettings.RABBITMQ_URL,
            exchange='MergerPipelineExchange',
            routing_key='SolrUpdateRoute'):
    w = RabbitMQWorker()
    w.connect(url)
    payload = json.dumps(bibcodes)
    w.channel.basic_publish(exchange, routing_key, payload)
    w.connection.close()
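# A minimal usage sketch for the publish() helpers above. Each variant
# presumably lives in its own module in the pipeline (they share a name), so
# only one is in scope at a time; the sample bibcodes are illustrative
# assumptions, not values taken from the pipeline.
sample_bibcodes = ['2000tmp..conf..123A', '2000tmp..conf..456B']
publish(sample_bibcodes)                                 # module defaults
publish(sample_bibcodes, routing_key='SolrUpdateRoute')  # explicit route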
def connect_publisher(self):
    """
    Makes a connection between the worker and the RabbitMQ instance, and
    sets up an attribute as a channel.

    :return: no return
    """
    self.publish_worker = RabbitMQWorker()
    self.ret_queue = self.publish_worker.connect(psettings.RABBITMQ_URL)
def publish(records, max_queue_size=30, url=psettings.RABBITMQ_URL,
            exchange='MergerPipelineExchange',
            routing_key='FindNewRecordsRoute', LOGGER=LOGGER):
    # It's OK that we create/tear down this connection many times within this
    # script; it is not a bottleneck and likely slightly increases stability
    # of the workflow.
    w = RabbitMQWorker()
    w.connect(url)

    # Hold onto the message if publishing it would cause the number of queued
    # messages to exceed max_queue_size.
    queues = ['UpdateRecordsQueue', 'ReadRecordsQueue']
    responses = [w.channel.queue_declare(queue=i, passive=True)
                 for i in queues]
    while any(r.method.message_count >= max_queue_size for r in responses):
        LOGGER.debug(">%s messages in the relevant queue(s). "
                     "I will wait 15s while they get consumed."
                     % max_queue_size)
        time.sleep(15)
        responses = [w.channel.queue_declare(queue=i, passive=True)
                     for i in queues]

    payload = json.dumps(records)
    w.channel.basic_publish(exchange, routing_key, payload)
    LOGGER.debug("Published payload with hash: %s" % hash(payload))
    w.connection.close()
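# A hedged sketch of how the backpressure-aware publish() above might be
# driven: split a large record set into chunks so that each call re-checks
# the queue depth before publishing. The helper name and chunk size are
# assumptions for illustration only.
def publish_in_chunks(records, chunk_size=100):
    """Publish records in chunks, letting publish() block while queues drain."""
    for start in range(0, len(records), chunk_size):
        publish(records[start:start + chunk_size], max_queue_size=30)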
class TestGeneric(unittest.TestCase):
    """
    Generic test class. Used as the primary class that implements a standard
    integration test. Also contains a range of helper functions, and the
    correct tearDown method when interacting with RabbitMQ.
    """

    def setUp(self):
        """
        Sets up the parameters for the RabbitMQ workers, and also the workers
        themselves. Generates all the queues that should be in place for
        testing the RabbitMQ workers.

        :return: no return
        """
        # Build the link files
        build_links(test_name='integration')

        # Load the extraction workers
        check_params = psettings.WORKERS['CheckIfExtractWorker']
        standard_params = psettings.WORKERS['StandardFileExtractWorker']
        writer_params = psettings.WORKERS['WriteMetaFileWorker']
        error_params = psettings.WORKERS['ErrorHandlerWorker']
        proxy_params = psettings.WORKERS['ProxyPublishWorker']

        for params in [check_params, standard_params, writer_params,
                       error_params, proxy_params]:
            params['RABBITMQ_URL'] = psettings.RABBITMQ_URL
            params['ERROR_HANDLER'] = psettings.ERROR_HANDLER
            params['extract_key'] = 'FULLTEXT_EXTRACT_PATH_UNITTEST'
            params['TEST_RUN'] = True
            params['PDF_EXTRACTOR'] = psettings.PDF_EXTRACTOR
            params['PROXY_PUBLISH'] = psettings.PROXY_PUBLISH

        self.params = params
        self.check_worker = CheckIfExtractWorker(params=check_params)
        self.standard_worker = StandardFileExtractWorker(
            params=standard_params)
        self.standard_worker.logger.debug(
            'params: {0}'.format(standard_params))
        self.meta_writer = WriteMetaFileWorker(params=writer_params)
        self.error_worker = ErrorHandlerWorker(params=error_params)
        self.proxy_worker = ProxyPublishWorker(params=proxy_params)
        self.meta_path = ''
        self.channel_list = None

        # Queues and routes are switched on so that they can allow workers
        # to connect
        TM = TaskMaster(psettings.RABBITMQ_URL, psettings.RABBITMQ_ROUTES,
                        psettings.WORKERS)
        TM.initialize_rabbitmq()

        self.connect_publisher()
        self.purge_all_queues()

    def connect_publisher(self):
        """
        Makes a connection between the worker and the RabbitMQ instance, and
        sets up an attribute as a channel.

        :return: no return
        """
        self.publish_worker = RabbitMQWorker()
        self.ret_queue = self.publish_worker.connect(psettings.RABBITMQ_URL)

    def purge_all_queues(self):
        """
        Purges all the content from all the queues existing in psettings.py.

        :return: no return
        """
        for queue in psettings.RABBITMQ_ROUTES['QUEUES']:
            _q = queue['queue']
            self.publish_worker.channel.queue_purge(queue=_q)

    def tearDown(self):
        """
        General tearDown of the class. Purges the queues and then sleeps so
        that it does not contaminate the next set of tests.

        :return: no return
        """
        self.purge_all_queues()
        time.sleep(5)

    def helper_get_details(self, test_publish):
        """
        Generates a bunch of relevant information about the stub data being
        used. The attribute names should be relevant.

        :param test_publish: path to the test stub file
        :return: no return
        """
        with open(os.path.join(PROJ_HOME, test_publish), 'r') as f:
            lines = f.readlines()

        self.nor = len(lines)
        self.bibcode, self.ft_source, self.provider = \
            lines[0].strip().split('\t')
        self.bibcode_list = [i.strip().split('\t')[0] for i in lines]

        self.test_expected = check_if_extract.create_meta_path(
            {'bibcode': self.bibcode},
            extract_key='FULLTEXT_EXTRACT_PATH_UNITTEST'
        )
        self.meta_list = \
            [check_if_extract.create_meta_path(
                {'bibcode': j},
                extract_key='FULLTEXT_EXTRACT_PATH_UNITTEST'
            ).replace('meta.json', '') for j in self.bibcode_list]
        self.meta_path = self.test_expected.replace('meta.json', '')

        self.number_of_PDFs = len(
            list(
                filter(lambda x: x.lower().endswith('.pdf'),
                       [i.strip().split('\t')[-2] for i in lines])
            )
        )
        self.number_of_standard_files = self.nor - self.number_of_PDFs

    def calculate_expected_folders(self, full_text_links):
        """
        Determines the paths that should exist if the test data was extracted.

        :param full_text_links: file that contains the full text links stub
            data
        :return: list of expected paths that would be created when the full
            text was extracted
        """
        with open(os.path.join(PROJ_HOME, full_text_links), 'r') as inf:
            lines = inf.readlines()

        expected_paths = \
            [check_if_extract.create_meta_path(
                {CONSTANTS['BIBCODE']: line.strip().split('\t')[0]},
                extract_key='FULLTEXT_EXTRACT_PATH_UNITTEST'
            ).replace('meta.json', '') for line in lines]

        return expected_paths

    def clean_up_path(self, paths):
        """
        Takes the paths given and deletes any content that should have been
        created when the full text was extracted.

        :param paths: list of paths whose content should be cleaned up
        :return: no return
        """
        for path in paths:
            if os.path.exists(path):
                meta = os.path.join(path, 'meta.json')
                fulltext = os.path.join(path, 'fulltext.txt')
                dataset = os.path.join(path, 'dataset.txt')
                acknowledgements = os.path.join(path, 'acknowledgements.txt')

                file_list = [meta, fulltext, dataset, acknowledgements]
                for file_ in file_list:
                    if os.path.exists(file_):
                        os.remove(file_)
                os.rmdir(path)
                print('deleted: {0} and its content'.format(path))
            else:
                print('Could not delete {0}, does not exist'.format(path))
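# A hedged sketch of a concrete integration test built on TestGeneric. The
# stub-file path and the assertion are illustrative assumptions modelled on
# the helper methods above, not a test copied from the suite.
class TestExtractionWorkflow(TestGeneric):

    def test_stub_data_details(self):
        # Hypothetical stub file; real tests point at actual link files
        test_publish = 'tests/test_integration/stub_data/fulltext.links'
        self.helper_get_details(test_publish)

        expected_paths = self.calculate_expected_folders(test_publish)
        try:
            # One expected extraction folder per record in the stub file
            self.assertEqual(len(expected_paths), self.nor)
        finally:
            self.clean_up_path(expected_paths)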
def main(MONGO=MONGO, *args):
    if args:
        sys.argv.extend(*args)

    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--target-bibcodes',
        nargs='*',
        default=[],
        dest='targetBibcodes',
        help='Only analyze the specified bibcodes, and ignore their JSON '
             'fingerprints. Only works when --async=False. Use the syntax '
             '@filename.txt to read these from file (one bibcode per line)'
    )

    parser.add_argument(
        '--async',
        default=False,
        action='store_true',
        dest='async_run',  # 'async' is a reserved word in Python 3
        help='start in async mode'
    )

    parser.add_argument(
        '--dont-init-lookers-cache',
        default=False,
        action='store_true',
        dest='dont_init_lookers_cache',
        help="don't call ADSExports2.init_lookers_cache()"
    )

    parser.add_argument(
        '--load-records-from-pickle',
        nargs='*',
        default=None,
        dest='load_records_from_pickle',
        help='Load XML records from a pickle instead of ADSExports'
    )

    parser.add_argument(
        '--dump-output-to-file',
        nargs=1,
        type=str,
        default=None,
        dest='outfile',
        help='Output records to a file'
    )

    parser.add_argument(
        '--ignore-json-fingerprints',
        default=False,
        action='store_true',
        dest='ignore_json_fingerprints',
        help='ignore JSON fingerprints when finding new records to update '
             '(i.e. force update)'
    )

    parser.add_argument(
        '--process-deletions',
        default=False,
        action='store_true',
        dest='process_deletions',
        help='Find orphaned bibcodes in the mongodb, then send these bibcodes '
             'to be deleted via RabbitMQ. No updates will be processed when '
             'this flag is set.'
    )

    parser.add_argument(
        '--max-deletions',
        default=2000,
        type=int,
        dest='max_deletions',
        help='Maximum number of deletions to attempt; if over this limit, '
             'exit and log an error'
    )

    args = parser.parse_args()

    if not args.dont_init_lookers_cache:
        start = time.time()
        logger.info("Calling init_lookers_cache()")
        ReadRecords.INIT_LOOKERS_CACHE()
        logger.info("init_lookers_cache() returned in %0.1f sec"
                    % (time.time() - start))

    records = readBibcodesFromFile(BIBCODE_FILES)
    targets = None
    if args.targetBibcodes:
        if args.targetBibcodes[0].startswith('@'):
            with open(args.targetBibcodes[0].replace('@', '')) as fp:
                targetBibcodes = deque([L.strip() for L in fp.readlines()
                                        if L and not L.startswith('#')])
        else:
            targetBibcodes = args.targetBibcodes
        targets = {bibcode: records[bibcode] for bibcode in targetBibcodes}

    records = deque(ReadRecords.canonicalize_records(records, targets))
    total = float(len(records))  # Save to print later

    if args.ignore_json_fingerprints:
        records = deque([(r[0], 'ignore') for r in records])

    if args.process_deletions:
        start = time.time()
        logger.info("Processing deletions. This will block for several hours "
                    "until the database is compared, then exit.")
        logger.warning("No updates will be processed when --process-deletions "
                       "is set")
        mongo = MongoConnection.PipelineMongoConnection(**MONGO)
        results = mongo.getAllBibcodes()
        if len(results) != mongo.db[mongo.collection].count():
            logger.warning("len getAllBibcodes (%s) != len count (%s). "
                           "Continuing anyway."
                           % (len(results),
                              mongo.db[mongo.collection].count()))
        mongo.close()  # close only after the connection has been used
        records = [i[0] for i in records]
        payload = list(set(results).difference(set(records)))
        if len(payload) > args.max_deletions:
            logger.critical("|".join(payload))
            logger.critical("Too many deletions: {} > {}".format(
                len(payload), args.max_deletions))
            sys.exit(1)
        w = RabbitMQWorker()
        w.connect(psettings.RABBITMQ_URL)
        publish(w, payload, routing_key='DeletionRoute')
        logger.info("Found %s orphaned bibcodes in %0.1f seconds."
                    % (len(payload), time.time() - start))
        sys.exit(0)

    if not args.async_run:
        mongo = MongoConnection.PipelineMongoConnection(**MONGO)
        records = mongo.findNewRecords(records)
        if args.load_records_from_pickle:
            records = ReadRecords.readRecordsFromPickles(
                records, args.load_records_from_pickle)
        else:
            records = ReadRecords.readRecordsFromADSExports(records)
        merged = UpdateRecords.mergeRecords(records)
        if args.outfile:
            with open(args.outfile[0], 'w') as fp:
                r = {'merged': merged, 'nonmerged': records}
                json.dump(r, fp, indent=1)
        else:
            bibcodes = mongo.upsertRecords(merged)
            # SolrUpdater.solrUpdate(bibcodes)
    elif args.async_run:
        w = RabbitMQWorker()
        w.connect(psettings.RABBITMQ_URL)
        lastLogged = None
        while records:
            payload = []
            while len(payload) < BIBCODES_PER_JOB:
                try:
                    payload.append(records.popleft())
                except IndexError:
                    break
            percent = round((1 - len(records) / total) * 100.0)
            if not percent % 5 and percent != lastLogged:
                lastLogged = percent
                logger.info("There are %s records left (%0.1f%% completed)"
                            % (len(records), percent))
            publish(w, payload)
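# Example invocations of main() from the command line (as comments; the
# script name is an assumption, not taken from the pipeline):
#
#   python run.py                                  # synchronous merge/update
#   python run.py --async                          # queue jobs via RabbitMQ
#   python run.py --target-bibcodes @bibcodes.txt  # one bibcode per line
#   python run.py --process-deletions --max-deletions 500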