def read_bibcode_file(self, bibcode_file_path):
    """Read a list of bibcodes from a file.

    Each bibcode must be at the beginning of a row (rows starting with a
    space are skipped); everything after the first TAB on a row is ignored.

    @param bibcode_file_path: path of the file to read.
    @return: list of the bibcodes found.
    @raise GenericError: if the file cannot be opened.
    """
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    printmsg(self.verbose, "Reading %s \n" % bibcode_file_path)
    try:
        bibfile = open(bibcode_file_path, "rU")
    except IOError:
        sys.stdout.write("Input file not readable \n")
        raise GenericError('Mandatory file not readable. Please check %s \n' % bibcode_file_path)
    bibcodes_list = []
    # try/finally guarantees the handle is released even if a row cannot
    # be processed (the original leaked the open file in that case)
    try:
        for bibrow in bibfile:
            if bibrow[0] != " ":
                bibcode = bibrow.split('\t')[0].rstrip('\n')
                if bibcode != '':
                    bibcodes_list.append(bibcode)
    finally:
        bibfile.close()
    return bibcodes_list
def rem_bibs_to_extr_del(self, extraction_dir):
    """Return the bibcodes left unprocessed by an extraction.

    Reads the bookkeeping files of extraction_dir and returns a tuple
    (bibcodes still to extract, bibcodes still to delete); the extraction
    list is ordered with the preprint bibcodes first.
    """
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    # what the extraction was supposed to do
    to_extract = set(self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['new'])))
    to_delete = set(self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['del'])))
    # what the extraction actually went through (with or without problems)
    had_problems = set(self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['prob'])))
    were_done = set(self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['done'])))
    processed = had_problems | were_done
    bibcodes_to_extract_remaining = list(to_extract - processed)
    bibcodes_to_delete_remaining = list(to_delete - processed)
    # if anything is left to extract, reorder it: preprints first, then the rest
    if len(bibcodes_to_extract_remaining) > 0:
        # NOTE(review): the preprint copy is joined with BASE_OUTPUT_PATH while
        # the other four files are not; this works only when extraction_dir is
        # an absolute path (os.path.join then discards BASE_OUTPUT_PATH) - confirm
        preprints = set(self.read_bibcode_file(os.path.join(settings.BASE_OUTPUT_PATH, extraction_dir, 'PRE_' + os.path.basename(settings.BIBCODES_PRE))))
        remaining_preprint = sorted(set(bibcodes_to_extract_remaining) & preprints)
        other_remaining = sorted(set(bibcodes_to_extract_remaining) - set(remaining_preprint))
        bibcodes_to_extract_remaining = remaining_preprint + other_remaining
    return (bibcodes_to_extract_remaining, bibcodes_to_delete_remaining)
def problematic_extraction_process(q_probl, num_active_workers, lock_stdout, q_life, extraction_directory, verbose):
    """Worker that collects the bibcodes that could not be extracted and
    appends them to the "problematic bibcodes" file.

    Consumes q_probl until every extraction worker has announced its
    termination, then notifies the manager through q_life.
    """
    while True:
        message = q_probl.get()
        if message[0] == "WORKER DONE":
            # an extraction worker finished: stop once none is left
            num_active_workers -= 1
            if num_active_workers == 0:
                break
        else:
            # a real group of problematic bibcodes: persist it if not empty
            if len(message[1]) > 0:
                w2f = write_files.WriteFile(extraction_directory, verbose)
                w2f.write_problem_bibcodes_to_file(message[1])
                lock_stdout.acquire()
                printmsg(True, multiprocessing.current_process().name + (" (problematic bibcodes worker) wrote problematic bibcodes for group %s \n" % message[0]))
                lock_stdout.release()
    # tell the manager that this worker is done and exiting
    q_life.put(["PROBLEMBIBS DONE"])
    lock_stdout.acquire()
    printmsg(True, multiprocessing.current_process().name + " (problematic bibcodes worker) job finished: exiting \n")
    lock_stdout.release()
def init_stylesheet(self):
    """Initialize the XSLT transformation engine.

    Parses self.stylesheet and stores the compiled stylesheet object in
    self.style_obj.

    @return: True on success.
    @raise GenericError: if the stylesheet cannot be parsed.
    """
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    # create the stylesheet object; "except Exception" replaces the
    # previous bare "except:" so SystemExit/KeyboardInterrupt still propagate
    try:
        self.style_obj = libxslt.parseStylesheetDoc(libxml2.parseFile(self.stylesheet))
    except Exception:
        raise GenericError("ERROR: problem loading stylesheet \n")
    return True
def get_all_bibcodes(self):
    """Return the sorted, de-duplicated list of all the known bibcodes."""
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    # timestamp files, ordered by increasing order of importance
    source_files = [settings.BIBCODES_GEN,
                    settings.BIBCODES_PRE,
                    settings.BIBCODES_PHY,
                    settings.BIBCODES_AST]
    all_bibcodes = set()
    for source_file in source_files:
        all_bibcodes.update(self.read_bibcode_file(source_file))
    return sorted(all_bibcodes)
def process_bibcodes_to_delete(self):
    """Create the MarcXML for the bibcodes to delete and write it to a file.

    A single file is produced for all the bibcodes to delete, since the
    XML is simple enough not to require splitting in groups.

    @return: True on success.
    @raise GenericError: if the output file cannot be created.
    """
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    # base object for the XML tree
    doc = libxml2.newDoc("1.0")
    root = doc.newChild(None, "collection", None)
    # one <record> per bibcode to delete, with the two mandatory datafields
    for bibcode in self.bibcodes_to_delete_list:
        record = root.newChild(None, "record", None)
        # datafield 970 carries the bibcode
        d970 = record.newChild(None, "datafield", None)
        d970.setProp("tag", "970")
        d970.setProp("ind1", "")
        # BUGFIX: the second indicator was set as "ind1" twice, leaving ind2 unset
        d970.setProp("ind2", "")
        # libxml2's newChild does not escape the content, so "&" must be
        # escaped by hand (the original replace("&", "&") was a no-op,
        # presumably mangled from this escape)
        sub = d970.newChild(None, "subfield", bibcode.replace("&", "&amp;"))
        sub.setProp("code", "a")
        # datafield 980 carries the DELETED marker
        d980 = record.newChild(None, "datafield", None)
        d980.setProp("tag", "980")
        d980.setProp("ind1", "")
        # BUGFIX: same ind1/ind2 duplication as above
        d980.setProp("ind2", "")
        sub = d980.newChild(None, "subfield", "DELETED")
        sub.setProp("code", "c")
    # serialize the tree and free the C-level memory
    marcxml_string = doc.serialize("UTF-8", 2)
    doc.freeDoc()
    del doc
    # write the MarcXML (this also updates the "done bibcodes" file)
    w2f = write_files.WriteFile(self.extraction_directory, self.verbose)
    filename_delete = w2f.write_bibcodes_to_delete_file(marcxml_string, self.bibcodes_to_delete_list, self.extraction_name)
    if filename_delete:
        printmsg(self.verbose, "The MarcXML for the bibcode to delete has been written to the file %s \n" % filename_delete)
    else:
        raise GenericError("Impossible to create the file for the MarcXML of the bibcodes to delete")
    return True
def extr_diff_bibs_from_extraction(self, extraction_dir):
    """Return the bibcodes scheduled by an extraction (to extract or to
    delete) that were never processed (neither done nor problematic)."""
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    # everything the extraction was supposed to handle
    scheduled = set(self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['new'])))
    scheduled |= set(self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['del'])))
    # everything that actually went through, successfully or not
    processed = set(self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['prob'])))
    processed |= set(self.read_bibcode_file(os.path.join(extraction_dir, settings.BASE_FILES['done'])))
    # the difference is what is still pending
    return list(scheduled - processed)
def write_done_bibcodes_to_file(self, bibcodes_list):
    """Append a list of bibcodes to the file of the done bibcodes.

    @param bibcodes_list: iterable of bibcode strings.
    @return: True on success.
    @raise GenericError: if the file cannot be written.
    """
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    filepath = os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, settings.BASE_FILES['done'])
    try:
        # "with" closes the handle even on a write error (the original
        # bare "except:" also leaked the open file in that case)
        with open(filepath, 'a') as file_obj:
            for bibcode in bibcodes_list:
                file_obj.write(bibcode + '\n')
    except EnvironmentError:
        raise GenericError('Impossible to write in the "bibcode done file" %s \n' % filepath)
    return True
def manage(self):
    """Public entry point: validate the extraction mode, retrieve the
    lists of bibcodes to extract/delete and run the record extractor."""
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    # only 'full' and 'update' are acceptable extraction modes
    if self.mode not in ('full', 'update'):
        raise GenericError('Wrong parameter: the extraction can be only full or update')
    # the list of bibcodes to extract and the list of bibcodes to delete
    (bibcodes_to_extract_list, bibcodes_to_delete_list) = self.retrieve_bibcodes_to_extract()
    # hand both lists over to the extractor manager
    are = ads_record_extractor.ADSRecordExtractor(bibcodes_to_extract_list, bibcodes_to_delete_list, self.dirname, self.verbose)
    del bibcodes_to_extract_list
    del bibcodes_to_delete_list
    are.extract()
    return
def write_marcxml_file(self, xmlstring, taskname, extraction_name):
    """Write a MarcXML string to a conventionally-named file.

    @param xmlstring: the MarcXML content to write.
    @param taskname: identifier of the group of bibcodes.
    @param extraction_name: name of the current extraction.
    @return: the path of the written file, or False on failure.
    """
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    filename = settings.MARCXML_FILE_BASE_NAME + '_' + extraction_name + '_' + taskname + '.xml'
    filepath = os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, filename)
    printmsg(self.verbose, "Writing the MarcXML file %s \n" % filepath)
    try:
        # "with" closes the handle even if the write fails (the original
        # bare "except:" leaked the open file on a write error)
        with open(filepath, 'w') as file_obj:
            file_obj.write(xmlstring)
    except EnvironmentError:
        return False
    # release the (potentially large) XML string early
    del xmlstring
    return filepath
def transform(self, doc):
    """Apply the XSLT stylesheet to a parsed XML document.

    @param doc: a libxml2 document object (freed on success).
    @return: the transformed document as a string, or False on failure.
    """
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    # (re)load the stylesheet
    self.init_stylesheet()
    # apply the transformation; "except Exception" replaces the previous
    # bare "except:" so SystemExit/KeyboardInterrupt still propagate
    try:
        doc = self.style_obj.applyStylesheet(doc, None)
    except Exception:
        printmsg(True, "ERROR: Transformation failed \n")
        return False
    result = self.style_obj.saveResultToString(doc)
    # free the libxml2 document to avoid leaking C-level memory
    doc.freeDoc()
    return result
def retrieve_bibcodes_to_extract(self):
    """Retrieve the bibcodes that need to be extracted from ADS.

    If the previous extraction ended correctly (or no usable previous
    extraction exists) a new extraction directory is created and the
    lists are built according to self.mode; otherwise the bibcodes left
    pending by the last extraction are recovered.

    @return: tuple (bibcodes to extract, bibcodes to delete).
    """
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    # check the status of the last extraction
    status_last_extraction = self.check_last_extraction()
    if status_last_extraction in ('OK', 'NOTHING FOUND', 'NOT VALID DIRECTORY CONTENT'):
        printmsg(self.verbose, "Last extraction was fine: proceeding with a new one \n")
        # create the directory of this extraction and its empty bookkeeping files
        self.dirname = strftime("%Y_%m_%d-%H_%M_%S")
        extraction_path = os.path.join(settings.BASE_OUTPUT_PATH, self.dirname)
        # 0o755 replaces the py2-only octal literal 0755 (same value, valid in py3 too)
        os.mkdir(extraction_path, 0o755)
        for filetype in settings.BASE_FILES:
            # "with" guarantees the handles are closed (they were left open before)
            with open(os.path.join(extraction_path, settings.BASE_FILES[filetype]), 'w') as fileobj:
                fileobj.write('')
        # the file logging the extraction names
        with open(os.path.join(extraction_path, settings.EXTRACTION_FILENAME_LOG), 'w') as fileobj:
            fileobj.write('')
        # build the lists of bibcodes according to the mode
        if self.mode == 'full':
            return self.extract_full_list_of_bibcodes()
        elif self.mode == 'update':
            return self.extract_update_list_of_bibcodes()
    else:
        printmsg(self.verbose, "Last extraction was not fine: recovering \n")
        # recover the bibcodes left pending by the last extraction
        self.dirname = self.lastest_extr_dir
        return self.rem_bibs_to_extr_del(os.path.join(settings.BASE_OUTPUT_PATH, self.lastest_extr_dir))
def write_bibcodes_to_delete_file(self, xmlstring, bibcodes_list, extraction_name):
    """Write the MarcXML of the bibcodes to delete and append those
    bibcodes to the "done bibcodes" file.

    @param xmlstring: MarcXML content to write.
    @param bibcodes_list: the bibcodes being deleted.
    @param extraction_name: name of the current extraction.
    @return: path of the written XML file, or False if it cannot be written.
    @raise GenericError: if the "done bibcodes" file cannot be updated.
    """
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    # complete path and filename of the file to write
    filename = settings.BIBCODE_TO_DELETE_OUT_NAME + '_' + extraction_name + '.xml'
    filepath = os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, filename)
    printmsg(self.verbose, "Writing the MarcXML file %s \n" % filepath)
    try:
        # "with" closes the handle even on failure (the original bare
        # "except:" leaked the open file on a write error)
        with open(filepath, 'w') as file_obj:
            file_obj.write(xmlstring)
    except EnvironmentError:
        return False
    # release the (potentially large) XML string early
    del xmlstring
    # append the bibcodes actually written to the "done" file
    bibdone_filename = os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, settings.BASE_FILES['done'])
    printmsg(self.verbose, 'Updating the "processed bibcodes" file %s \n' % bibdone_filename)
    try:
        with open(bibdone_filename, 'a') as file_obj:
            for bibcode in bibcodes_list:
                file_obj.write(bibcode + '\n')
    except EnvironmentError:
        raise GenericError('Impossible to write in the "bibcode done file" %s \n' % bibdone_filename)
    return filepath
def extract(self):
    """Drive the whole extraction: first the bibcodes to delete, then the
    bibcodes to extract through a pool of worker processes."""
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    # deletions must be uploaded before everything else, so they go first
    if self.bibcodes_to_delete_list:
        try:
            self.process_bibcodes_to_delete()
        except Exception:
            printmsg(True, "Unable to process the bibcodes to delete \n")
            raise GenericError("Unable to process the bibcodes to delete \n")
    # split the bibcodes to extract in groups and hand them to a manager process
    bibtoprocess_splitted = self.grouper(settings.NUMBER_OF_BIBCODES_PER_GROUP, self.bibcodes_to_extract_list)
    manager = multiprocessing.Process(
        target=extractor_manager_process,
        args=(bibtoprocess_splitted, self.extraction_directory, self.extraction_name, self.verbose))
    # run the manager and wait for it to finish
    manager.start()
    manager.join()
    printmsg(True, "Extraction ended! \n")
def done_extraction_process(q_done, num_active_workers, lock_stdout, q_life, extraction_directory, verbose):
    """Worker that collects the groups of successfully processed bibcodes
    and appends them to the "done bibcodes" file.

    NOTE: this could also be the process that submits the upload tasks to Invenio.
    """
    while True:
        message = q_done.get()
        if message[0] == "WORKER DONE":
            # an extraction worker finished: stop once none is left
            num_active_workers -= 1
            if num_active_workers == 0:
                break
        else:
            # a real group of done bibcodes: persist it if not empty
            if len(message[1]) > 0:
                w2f = write_files.WriteFile(extraction_directory, verbose)
                w2f.write_done_bibcodes_to_file(message[1])
                lock_stdout.acquire()
                printmsg(True, multiprocessing.current_process().name + (" (done bibcodes worker) wrote done bibcodes for group %s \n" % message[0]))
                lock_stdout.release()
            # hook for submitting the upload of this file to Invenio
            filename_path = message[2]
            # invenio.bibtask.task_low_level_submission http://bit.ly/nnQZbs
    # tell the manager that this worker is done and exiting
    q_life.put(["DONEBIBS DONE"])
    lock_stdout.acquire()
    printmsg(True, multiprocessing.current_process().name + " (done bibcodes worker) job finished: exiting \n")
    lock_stdout.release()
def extract_full_list_of_bibcodes(self):
    """Build the complete list of bibcodes to extract.

    The preprint (arXiv) bibcodes come first, so that they can later be
    overwritten by the published version of the same record.

    @return: tuple (bibcodes to extract, empty list of bibcodes to delete).
    @raise GenericError: if the preprint file cannot be copied locally.
    """
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    # the list of preprint bibcodes
    preprint_bibcodes = self.read_bibcode_file(settings.BIBCODES_PRE)
    # keep a local copy of the preprint file (needed for recovery runs)
    local_copy = os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, 'PRE_' + os.path.basename(settings.BIBCODES_PRE))
    try:
        shutil.copy(settings.BIBCODES_PRE, local_copy)
    # narrowed from a bare "except:": copy failures surface as OS or shutil errors
    except (EnvironmentError, shutil.Error):
        raise GenericError('Impossible to copy a mandatory file from %s to %s' % (settings.BIBCODES_PRE, os.path.join(settings.BASE_OUTPUT_PATH, self.dirname)))
    # complete list of bibcodes; the non-preprint ones are sorted
    all_bibcodes = self.get_all_bibcodes()
    not_pre_bibcodes = sorted(set(all_bibcodes) - set(preprint_bibcodes))
    # preprints first so they can be overwritten by the published records
    bibcode_to_extract = preprint_bibcodes + not_pre_bibcodes
    # persist the list in the "to extract" file ("with" closes the handle)
    with open(os.path.join(settings.BASE_OUTPUT_PATH, self.dirname, settings.BASE_FILES['new']), 'a') as bibcode_file:
        for bibcode in bibcode_to_extract:
            bibcode_file.write(bibcode + '\n')
    printmsg(self.verbose, "Full list of bibcodes and related file generated \n")
    # the full list of bibcodes and an empty list of bibcodes to delete
    return (bibcode_to_extract, [])
def set_extraction_name(self):
    """Compute and log the name of the current extraction.

    Reads the extraction log file, increments the number of the last
    logged extraction (or starts from 1) and appends the new name.
    """
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    filepath = os.path.join(settings.BASE_OUTPUT_PATH, self.extraction_directory, settings.EXTRACTION_FILENAME_LOG)
    log_file = open(filepath, "r")
    rows = log_file.readlines()
    log_file.close()
    # the next extraction number follows the one on the last log row
    if rows:
        number_ext = int(rows[-1].split(settings.EXTRACTION_BASE_NAME)[1]) + 1
    else:
        number_ext = 1
    extraction_name = settings.EXTRACTION_BASE_NAME + str(number_ext)
    # append the new extraction name to the log
    log_file = open(filepath, "a")
    log_file.write(extraction_name + "\n")
    log_file.close()
    return extraction_name
def extract_update_list_of_bibcodes(self):
    """Build the lists of bibcodes to extract and to delete for an
    update extraction, persisting both in the bookkeeping files."""
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    records_added, records_modified, records_deleted = timestamp_manager.get_records_status(self.verbose)
    # added and modified records must be (re-)extracted, deleted ones removed
    bibcodes_to_extract = sorted(list(records_added) + list(records_modified))
    bibcodes_to_delete = sorted(records_deleted)
    # persist both lists in the files of this extraction
    base_dir = os.path.join(settings.BASE_OUTPUT_PATH, self.dirname)
    for filekey, bibcodes in (('new', bibcodes_to_extract), ('del', bibcodes_to_delete)):
        bibcode_file = open(os.path.join(base_dir, settings.BASE_FILES[filekey]), 'a')
        for bibcode in bibcodes:
            bibcode_file.write(bibcode + '\n')
        bibcode_file.close()
    # the list of bibcodes to extract and the list of bibcodes to delete
    return (bibcodes_to_extract, bibcodes_to_delete)
def get_records_status(verbose=False):
    """Compare ADS with Invenio and classify the bibcodes.

    Returns a tuple (records_added, records_modified, records_deleted):
    * added: bibcodes in ADS but not in Invenio;
    * modified: bibcodes in both whose timestamp changed since the last update;
    * deleted: bibcodes in Invenio but not in ADS.

    NOTE(review): added/deleted are returned as sets while modified is a
    list -- callers appear to cope with either, but confirm before relying on it.
    """
    records_modified = []
    printmsg(verbose, "Getting ADS timestamps. \n")
    ads_timestamps = _get_ads_timestamps()
    printmsg(verbose, "Getting ADS bibcodes. \n")
    ads_bibcodes = set(ads_timestamps.keys())
    printmsg(verbose, "Getting Invenio bibcodes. \n")
    invenio_bibcodes = _get_invenio_bibcodes()
    printmsg(verbose, "Deducting the added records. \n")
    records_added = ads_bibcodes - invenio_bibcodes
    printmsg(verbose, " %d records to add." % len(records_added))
    printmsg(verbose, "Deducting the deleted records. \n")
    records_deleted = invenio_bibcodes - ads_bibcodes
    printmsg(verbose, " %d records to delete." % len(records_deleted))
    # records present on both sides: compare their timestamps one by one
    records_to_check = invenio_bibcodes - records_deleted
    printmsg(verbose, "Checking timestamps for %d records. \n" % len(records_to_check))
    # TODO: This can probably be sped up by working with chunks of bibcodes
    # instead of single bibcodes.
    for bibcode in records_to_check:
        ads_timestamp = ads_timestamps[bibcode]
        invenio_recid = get_mysql_recid_from_aleph_sysno(bibcode)
        invenio_timestamp = get_fieldvalues(invenio_recid, "995__a")
        if not invenio_timestamp:
            # Maybe we could add instead of exiting.
            printmsg(True, "ERROR: Record %s in Invenio does not " "have a timestamp. \n" % bibcode)
            sys.exit(1)
        elif invenio_timestamp != ads_timestamp:
            records_modified.append(bibcode)
    printmsg(verbose, "Done.")
    return records_added, records_modified, records_deleted
def check_last_extraction(self):
    """Check whether the last extraction finished properly.

    As a side effect self.lastest_extr_dir is set to the newest
    extraction directory found ('' when none exists).

    @return: one of 'NOTHING FOUND', 'NOT VALID DIRECTORY CONTENT',
        'LATEST NOT ENDED CORRECTLY' or 'OK'.
    """
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    # collect the extraction directories inside the output path
    directories = []
    for elem in os.listdir(settings.BASE_OUTPUT_PATH):
        if os.path.isdir(os.path.join(settings.BASE_OUTPUT_PATH, elem)):
            directories.append(elem)
    self.lastest_extr_dir = ''
    # nothing there: no previous extraction to check
    if not directories:
        printmsg(self.verbose, "Checked last extraction: status returned NOTHING FOUND \n")
        return 'NOTHING FOUND'
    # the newest directory is the first in reverse-sorted order
    directories.sort(reverse=True)
    self.lastest_extr_dir = directories[0]
    latest_path = os.path.join(settings.BASE_OUTPUT_PATH, self.lastest_extr_dir)
    printmsg(self.verbose, "Checking the directory %s \n" % latest_path)
    elements_from_last_extraction = os.listdir(latest_path)
    # every mandatory bookkeeping file must be present
    for name in settings.BASE_FILES:
        if settings.BASE_FILES[name] not in elements_from_last_extraction:
            printmsg(self.verbose, "Checked last extraction: status returned NOT VALID DIRECTORY CONTENT \n")
            return 'NOT VALID DIRECTORY CONTENT'
    # the lists of bibcodes must be consistent:
    # bibcodes extracted + bibcodes with problems == bibcodes scheduled
    printmsg(self.verbose, "Checking if the list of bibcodes actually extracted is equal to the one I had to extract \n")
    bibcodes_still_pending = self.extr_diff_bibs_from_extraction(latest_path)
    if len(bibcodes_still_pending) == 0:
        printmsg(self.verbose, "All the bibcodes from the last extraction have been processed \n")
    else:
        printmsg(self.verbose, "Checked last extraction: status returned LATEST NOT ENDED CORRECTLY \n")
        return 'LATEST NOT ENDED CORRECTLY'
    # every check passed
    printmsg(self.verbose, "Checked last extraction: status returned OK \n")
    return 'OK'
def grouper(self, n, iterable):
    """Split an iterable in groups of at most n elements.

    @param n: maximum size of each group.
    @param iterable: the elements to group (must not contain None).
    @return: list of lists, the last one possibly shorter than n.
    """
    printmsg(self.verbose, "In function %s.%s \n" % (self.__class__.__name__, inspect.stack()[0][3]))
    args = [iter(iterable)] * n
    # izip_longest pads the last group with None: filter the padding out.
    # "is not None" (instead of the previous "!= None") is the correct
    # identity test and keeps items whose __eq__ compares equal to None.
    return list(([e for e in t if e is not None] for t in itertools.izip_longest(*args)))
def extractor_process(q_todo, q_done, q_probl, lock_stdout, q_life, extraction_directory, extraction_name, verbose):
    """Worker function for the extraction of bibcodes from ADS.

    Pulls groups of bibcodes from q_todo, extracts them through the ADS
    export API, transforms the result to MarcXML and reports the outcome
    on q_done / q_probl; q_life tells the manager why the worker exits
    ("QUEUE EMPTY" or "MAX LIFE REACHED"). It has been defined outside
    any class because it's simpler to use with multiprocessing.
    """
    # maximum number of groups this worker may process before dying
    max_num_groups = settings.MAX_NUMBER_OF_GROUP_TO_PROCESS
    # True when exiting because the queue is empty, False when the group quota was reached
    queue_empty = False
    # process groups until the queue is empty or the quota is reached
    for grpnum in range(max_num_groups):
        task_todo = q_todo.get()
        if task_todo[0] == "STOP":
            # sentinel found: no more work available
            queue_empty = True
            break
        lock_stdout.acquire()
        printmsg(True, multiprocessing.current_process().name + (" (worker) starting to process group %s \n" % task_todo[0]))
        lock_stdout.release()
        # bibcodes extracted successfully / with problems in this group
        bibcodes_ok = []
        bibcodes_probl = []
        # ADS export object accumulating the records of the group
        recs = ads.ADSExports.ADSRecords("full", "XML")
        # error budget: bibcodes per group / 10 (minimum 500); skipping
        # more than this suggests a problem with the access to the data
        max_number_of_bibs_to_skip = max(settings.NUMBER_OF_BIBCODES_PER_GROUP / 10, 500)
        for bibcode in task_todo[1]:
            try:
                recs.addRecord(bibcode)
                bibcodes_ok.append(bibcode)
            except:
                printmsg(True, 'ERROR: problem retrieving the bibcode "%s" \n' % bibcode)
                bibcodes_probl.append(bibcode)
                max_number_of_bibs_to_skip = max_number_of_bibs_to_skip - 1
            # budget exhausted: abort this group
            if max_number_of_bibs_to_skip == 0:
                break
        # the budget was exhausted: report the suspected data-access
        # problem and stop pulling groups (simulates an empty queue)
        if max_number_of_bibs_to_skip == 0:
            lock_stdout.acquire()
            printmsg(True, multiprocessing.current_process().name + (" (worker) Detected possible error with ADS data access: skipped %s bibcodes in one group \n" % max(settings.NUMBER_OF_BIBCODES_PER_GROUP / 10, 500)))
            lock_stdout.release()
            queue_empty = True
            break
        # serialize the records extracted in this group
        xmlobj = recs.export()
        del recs
        try:
            # transform the exported XML to MarcXML
            transf = xml_transformer.XmlTransformer(verbose)
            marcxml = transf.transform(xmlobj)
        except:
            raise GenericError("Impossible to transform the XML!")
        # if the transformation was ok, write the file
        if marcxml:
            w2f = write_files.WriteFile(extraction_directory, verbose)
            wrote_filename = w2f.write_marcxml_file(marcxml, task_todo[0], extraction_name)
            # if the writing failed, all the bibcodes become problematic
            if not wrote_filename:
                bibcodes_probl = bibcodes_probl + bibcodes_ok
                bibcodes_ok = []
            del w2f
        # otherwise all the bibcodes become problematic
        else:
            bibcodes_probl = bibcodes_probl + bibcodes_ok
            bibcodes_ok = []
            wrote_filename = False
        # report the outcome of the group to the output workers
        q_done.put([task_todo[0], bibcodes_ok, wrote_filename])
        q_probl.put([task_todo[0], bibcodes_probl])
        lock_stdout.acquire()
        printmsg(True, multiprocessing.current_process().name + (" (worker) finished to process group %s \n" % task_todo[0]))
        lock_stdout.release()
    if queue_empty:
        # tell the output workers and the manager that the queue is empty
        q_done.put(["WORKER DONE"])
        q_probl.put(["WORKER DONE"])
        q_life.put(["QUEUE EMPTY"])
        lock_stdout.acquire()
        printmsg(True, multiprocessing.current_process().name + " (worker) Queue empty: exiting \n")
        lock_stdout.release()
    else:
        # quota reached: the manager will spawn a replacement worker
        q_life.put(["MAX LIFE REACHED"])
        lock_stdout.acquire()
        printmsg(True, multiprocessing.current_process().name + " (worker) Maximum amount of groups of bibcodes reached: exiting \n")
        lock_stdout.release()
def extractor_manager_process(bibtoprocess_splitted, extraction_directory, extraction_name, verbose):
    """Process that takes care of managing all the other worker processes.

    It fills the todo queue with the groups of bibcodes, spawns the
    extraction workers and the two output workers, and creates a new
    extraction worker whenever an existing one reaches the maximum number
    of groups of bibcodes to process. Returns when all workers are done.
    """
    # a queue for the bibcodes to process
    q_todo = multiprocessing.Queue()
    # a queue for the bibcodes processed
    q_done = multiprocessing.Queue()
    # a queue for the bibcodes with problems
    q_probl = multiprocessing.Queue()
    # a lock to write in stdout
    lock_stdout = multiprocessing.Lock()
    # a queue for the messages from the workers that have to tell the manager
    # when they reach the maximum number of chunks to process
    q_life = multiprocessing.Queue()
    lock_stdout.acquire()
    printmsg(verbose, multiprocessing.current_process().name + " (Manager) Filling the queue with the tasks \n")
    lock_stdout.release()
    # each group of bibcodes gets a unique zero-padded identifier
    counter = 0
    for grp in bibtoprocess_splitted:
        counter += 1
        q_todo.put([str(counter).zfill(7), grp])
    lock_stdout.acquire()
    printmsg(verbose, multiprocessing.current_process().name + " (Manager) Creating the first pool of workers \n")
    lock_stdout.release()
    # number of extraction workers
    # (in production should be a part of multiprocessing.cpu_count)
    number_of_processes = settings.NUMBER_WORKERS
    # the extraction worker processes
    processes = [multiprocessing.Process(target=extractor_process, args=(q_todo, q_done, q_probl, lock_stdout, q_life, extraction_directory, extraction_name, verbose)) for i in range(number_of_processes)]
    # one STOP sentinel per worker, so each of them can detect the end of the queue
    for i in range(number_of_processes):
        q_todo.put(["STOP", ""])
    lock_stdout.acquire()
    printmsg(verbose, multiprocessing.current_process().name + " (Manager) Creating the output workers \n")
    lock_stdout.release()
    # the "done bibcode" worker
    donebib = multiprocessing.Process(target=done_extraction_process, args=(q_done, number_of_processes, lock_stdout, q_life, extraction_directory, verbose))
    # the "problematic bibcode" worker
    problbib = multiprocessing.Process(target=problematic_extraction_process, args=(q_probl, number_of_processes, lock_stdout, q_life, extraction_directory, verbose))
    lock_stdout.acquire()
    printmsg(verbose, multiprocessing.current_process().name + " (Manager) Starting all the workers \n")
    lock_stdout.release()
    # start the extraction workers and the output handlers
    for p in processes:
        p.start()
    donebib.start()
    problbib.start()
    # I join all the processes
    # for p in processes:
    #     p.join()
    # donebib.join()
    # problbib.join()
    # wait for the life messages of the workers: a worker that reached its
    # quota gets a replacement, otherwise the active counters are decreased
    active_workers = settings.NUMBER_WORKERS
    additional_workers = 2
    while active_workers > 0 or additional_workers > 0:
        # the reason why a worker announced its death
        death_reason = q_life.get()
        if death_reason[0] == "MAX LIFE REACHED":
            # the worker hit its group quota: start a replacement
            # NOTE(review): decrementing additional_workers here (the counter of
            # the 2 output workers) rather than a worker counter looks
            # suspicious -- confirm this is intended
            newprocess = multiprocessing.Process(target=extractor_process, args=(q_todo, q_done, q_probl, lock_stdout, q_life, extraction_directory, extraction_name, verbose))
            newprocess.start()
            additional_workers = additional_workers - 1
            lock_stdout.acquire()
            printmsg(True, multiprocessing.current_process().name + " (Manager) New worker created \n")
            lock_stdout.release()
        elif death_reason[0] == "QUEUE EMPTY":
            # an extraction worker drained the queue and exited
            active_workers = active_workers - 1
            lock_stdout.acquire()
            printmsg(verbose, multiprocessing.current_process().name + " (Manager) %s workers waiting to finish their job \n" % str(active_workers))
            lock_stdout.release()
        elif death_reason[0] == "PROBLEMBIBS DONE":
            # the "problematic bibcodes" output worker exited
            additional_workers = additional_workers - 1
            lock_stdout.acquire()
            printmsg(verbose, multiprocessing.current_process().name + " (Manager) %s additional workers waiting to finish their job \n" % str(additional_workers))
            lock_stdout.release()
        elif death_reason[0] == "DONEBIBS DONE":
            # the "done bibcodes" output worker exited
            additional_workers = additional_workers - 1
            lock_stdout.acquire()
            printmsg(verbose, multiprocessing.current_process().name + " (Manager) %s additional workers waiting to finish their job \n" % str(additional_workers))
            lock_stdout.release()
    lock_stdout.acquire()
    printmsg(verbose, multiprocessing.current_process().name + " (Manager) All the workers are done. Exiting... \n")
    lock_stdout.release()