def merge_bibcodes(bibcodes, print_adsxml=False, print_marcxml=False, write_xml_to_disk=False):
    """
    Returns a merged version of the records identified by bibcodes.
    """
    # Extract the records from ADS.
    records = ADSRecords('full', 'XML')
    for bibcode in bibcodes:
        records.addCompleteRecord(bibcode)
    ads_xml_obj = records.export()
    if print_adsxml:
        print ads_xml_obj.serialize('UTF-8')
    if write_xml_to_disk:
        with open('/tmp/adsxml.xml', 'w') as f:
            f.write(ads_xml_obj.serialize('UTF-8'))

    # Convert to MarcXML.
    stylesheet = libxslt.parseStylesheetDoc(libxml2.parseFile(XSLT))
    xml_object = stylesheet.applyStylesheet(ads_xml_obj, None)
    if print_marcxml:
        print xml_object.serialize('UTF-8')
    if write_xml_to_disk:
        with open('/tmp/marcxml.xml', 'w') as f:
            f.write(xml_object.serialize('UTF-8'))

    merged_records, bibcodes_with_problems = merge_records_xml(xml_object)
    return merged_records
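# A minimal usage sketch for merge_bibcodes (assumptions: ADSRecords,
# libxml2/libxslt, the XSLT path and merge_records_xml are importable in this
# module, and the bibcodes below are placeholders rather than guaranteed
# identifiers).
if __name__ == '__main__':
    sample_bibcodes = ['2000A&A...362..333S', '1999ApJ...517..565P']
    merged = merge_bibcodes(sample_bibcodes, print_marcxml=True, write_xml_to_disk=True)
    # merged is whatever merge_records_xml returned for the merged records.
    print 'Merged %s record(s)' % len(merged)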
def readRecords(records, LOGGER=settings.LOGGER):
    '''
    records: [(bibcode, JSON_fingerprint), ...]
    '''
    h = hash(json.dumps(records))
    if not records:
        LOGGER.debug("No records given")
        return []
    targets = dict(records)

    s = time.time()
    records = ADSRecords('full', 'XML')
    failures = []
    for bibcode in targets.keys():
        try:
            records.addCompleteRecord(bibcode)
        except KeyboardInterrupt:
            raise
        except:
            failures.append(bibcode)
            LOGGER.warning("[%s] ADSRecords failed" % bibcode)
    records = records.export()
    if not records.content:
        return []
    ttc = time.time() - s
    rate = len(targets) / ttc
    if failures:
        LOGGER.warning('ADSRecords failed to retrieve %s records' % len(failures))
    LOGGER.info('ADSRecords took %0.1fs to query %s records (%0.1f rec/s)\t[%s]' % (ttc, len(targets), rate, h))

    records = ensureList(xmltodict.parse(records.__str__())['records']['record'])
    assert(len(records) == len(targets) - len(failures))

    # with open('%s.pickle' % uuid.uuid4(), 'w') as fp:
    #     pickle.dump(records, fp)

    return records, targets
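# readRecords relies on a helper, ensureList, to cope with xmltodict returning
# a single dict when the export contains one <record> and a list when it
# contains several. A minimal sketch of such a helper follows (assumption: the
# real implementation may differ; this version only normalizes that output).
def ensureList(value):
    if value is None:
        return []
    return value if isinstance(value, list) else [value]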
def extractor_process(q_todo, q_done, q_probl, q_uplfile, lock_stdout, lock_createdfiles, q_life, extraction_directory, extraction_name):
    """Worker function for the extraction of bibcodes from ADS.
    It has been defined outside any class because it is simpler to use with multiprocessing.
    """
    logger.warning(multiprocessing.current_process().name + ' (worker) Process started')

    #I create a local logger
    fh = logging.FileHandler(os.path.join(pipeline_settings.BASE_OUTPUT_PATH, extraction_directory,
                                          pipeline_settings.BASE_LOGGING_PATH,
                                          multiprocessing.current_process().name + '_worker.log'))
    fmt = logging.Formatter(pipeline_settings.LOGGING_FORMAT)
    fh.setFormatter(fmt)
    local_logger = logging.getLogger(pipeline_settings.LOGGING_WORKER_NAME)
    local_logger.addHandler(fh)
    local_logger.setLevel(logger.level)
    local_logger.propagate = False
    #I print the same message for the local logger
    local_logger.warning(multiprocessing.current_process().name + ' Process started')

    #I remove the automatic join from the queue of the files to upload
    q_uplfile.cancel_join_thread()

    #I get the maximum number of groups I can process
    max_num_groups = settings.MAX_NUMBER_OF_GROUP_TO_PROCESS
    #variable used to know if I'm exiting because the queue is empty or because I reached the maximum number of groups to process
    queue_empty = False

    #while there is something to process and I have not reached the maximum number of groups I can process, I keep processing
    for grpnum in range(max_num_groups):
        task_todo = q_todo.get()
        if task_todo[0] == 'STOP':
            queue_empty = True
            #I exit the loop
            break

        #I print when I'm starting the extraction
        local_logger.warning(multiprocessing.current_process().name + (' starting to process group %s' % task_todo[0]))

        ############
        #then I process the bibcodes
        # I define a couple of lists where to store the bibcodes processed
        bibcodes_ok = []
        bibcodes_probl = []
        #I define an ADSEXPORT object
        recs = ADSRecords('full', 'XML')
        # I define the maximum number of bibcodes I can skip per cycle: the number of bibcodes per group / 10 (minimum 500)
        # if I skip more than this amount it means that there is something
        # wrong with the access to the data and it's better to stop everything
        max_number_of_bibs_to_skip = max(settings.NUMBER_OF_BIBCODES_PER_GROUP / 10, settings.MAX_SKIPPED_BIBCODES)
        for bibcode in task_todo[1]:
            try:
                recs.addCompleteRecord(bibcode)
                bibcodes_ok.append(bibcode)
            except Exception, error:
                local_logger.error(': problem retrieving the bibcode "%s" in group %s' % (bibcode, task_todo[0]))
                #I catch the exception type name
                exc_type, exc_obj, exc_tb = sys.exc_info()
                try:
                    str_error_to_print = exc_type.__name__ + '\t' + str(error)
                except:
                    try:
                        str_error_to_print = u'%s\t%s' % (unicode(exc_type.__name__), unicode(error))
                    except:
                        local_logger.error(' Cannot log error for bibcode %s ' % bibcode)
                        str_error_to_print = ''
                bibcodes_probl.append((bibcode, str_error_to_print))
                max_number_of_bibs_to_skip = max_number_of_bibs_to_skip - 1
                #If I reach 0 it means that I skipped the maximum allowed number of bibcodes and there is probably a problem:
                #so I simulate an exit for an empty queue
                if max_number_of_bibs_to_skip == 0:
                    break
        #I exit from both loops
        if max_number_of_bibs_to_skip == 0:
            local_logger.warning(' Detected possible error with ADS data access: skipped %s bibcodes in one group' % max(settings.NUMBER_OF_BIBCODES_PER_GROUP / 10, settings.MAX_SKIPPED_BIBCODES))
            queue_empty = True
            break

        #I extract the object I created
        xmlobj = recs.export()
        del recs
        try:
            #I define a transformation object
            transf = xml_transformer.XmlTransformer(local_logger)
            #and I transform my object
            marcxml = transf.transform(xmlobj)
        except:
            err_msg = ' Impossible to transform the XML!'
            local_logger.critical(err_msg)
            raise GenericError(err_msg)

        if marcxml:
            #I merge the records
            merged_records, records_with_merging_probl = merger.merge_records_xml(marcxml)
            #If I had problems merging some records, I remove their bibcodes from "bibcodes_ok" and add them to "bibcodes_probl"
            for elem in records_with_merging_probl:
                try:
                    bibcodes_ok.remove(elem[0])
                except ValueError:
                    local_logger.warning(' Problems to remove bibcode "%s" in group "%s" from the list of bibcodes extracted after merging' % (elem[0], task_todo[0]))
                    if elem[0] in bibcodes_probl:
                        local_logger.error(': bibcode "%s" reached the merger but was in problematic bibcodes!' % elem[0])
            bibcodes_probl = bibcodes_probl + records_with_merging_probl

            #########
            #I write the object in a file
            ##########
            filepath = os.path.join(settings.BASE_OUTPUT_PATH, extraction_directory,
                                    pipeline_settings.BASE_BIBRECORD_FILES_DIR,
                                    pipeline_settings.BIBREC_FILE_BASE_NAME + '_' + extraction_name + '_' + task_todo[0])
            output = open(filepath, 'wb')
            pickle.dump(merged_records, output)
            output.close()
            #then I write the filepath to a file for eventual future recovery
            lock_createdfiles.acquire()
            bibrec_file_obj = open(os.path.join(settings.BASE_OUTPUT_PATH, extraction_directory, settings.LIST_BIBREC_CREATED), 'a')
            bibrec_file_obj.write(filepath + '\n')
            bibrec_file_obj.close()
            lock_createdfiles.release()
            #finally I append the file to the queue
            local_logger.info('Insert in queue for upload the file "%s" of the group "%s" ' % (filepath, task_todo[0]))
            q_uplfile.put((task_todo[0], filepath))
            #logger.info('record created, merged but not uploaded')
            #bibupload_merger(merged_records, local_logger, 'replace_or_insert')
        #otherwise I put all the bibcodes in the problematic list
        else:
            bibcodes_probl = bibcodes_probl + [(bib, 'Bibcode extraction ok, but xml generation failed') for bib in bibcodes_ok]
            bibcodes_ok = []

        #finally I pass the done bibcodes to the proper queue
        q_done.put([task_todo[0], bibcodes_ok])
        #and the problematic bibcodes
        q_probl.put([task_todo[0], bibcodes_probl])

        local_logger.warning(multiprocessing.current_process().name + (' finished to process group %s' % task_todo[0]))
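# A launch sketch for the worker above (assumptions: the payload format
# (group_name, [bibcodes]) and the ('STOP', ...) sentinel are inferred from the
# loop in extractor_process; the wiring below is illustrative and not the
# pipeline's actual manager code; multiprocessing is reused from the imports
# this module already needs).
def start_one_extractor(groups, extraction_directory, extraction_name):
    q_todo = multiprocessing.Queue()
    q_done = multiprocessing.Queue()
    q_probl = multiprocessing.Queue()
    q_uplfile = multiprocessing.Queue()
    q_life = multiprocessing.Queue()
    lock_stdout = multiprocessing.Lock()
    lock_createdfiles = multiprocessing.Lock()
    for group_name, bibcodes in groups:
        q_todo.put((group_name, bibcodes))
    q_todo.put(('STOP', None))
    worker = multiprocessing.Process(target=extractor_process,
                                     args=(q_todo, q_done, q_probl, q_uplfile,
                                           lock_stdout, lock_createdfiles, q_life,
                                           extraction_directory, extraction_name))
    worker.start()
    return worker, q_done, q_probl, q_uplfile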
def updateRecords(records, LOGGER=settings.LOGGER):
    if not records:
        LOGGER.debug("No records given")
        return []
    targets = dict(records)

    s = time.time()
    records = ADSRecords('full', 'XML')
    failures = []
    for bibcode in targets.keys():
        try:
            records.addCompleteRecord(bibcode)
        except KeyboardInterrupt:
            raise
        except:
            failures.append(bibcode)
            LOGGER.warning("[%s] ADSRecords failed" % bibcode)
    records = records.export()
    if not records.content:
        return []
    ttc = time.time() - s
    rate = len(targets) / ttc
    if failures:
        LOGGER.warning('ADSRecords failed to retrieve %s records' % len(failures))
    LOGGER.info('ADSRecords took %0.1fs to query %s records (%0.1f rec/s)' % (ttc, len(targets), rate))

    records = ensureList(xmltodict.parse(records.__str__())['records']['record'])
    with open('raw.txt', 'a') as fp:
        for r in records:
            fp.write('%s' % r)
            fp.write('\n\n')
    assert(len(records) == len(targets) - len(failures))

    #Could send these tasks out on a queue
    completeRecords = []
    for r in records:
        #Define top-level schema that will go in mongo
        cr = {
            'bibcode': r['@bibcode'],
            'JSON_fingerprint': targets[r['@bibcode']],
            'metadata': {},
        }

        #Find metadata blocks that need merging
        metadataCounter = collections.Counter([entry['@type'] for entry in r['metadata']])
        needsMerging = dict([(k, []) for k, v in metadataCounter.iteritems() if v > 1])

        #Iterate over metadata blocks; directly input single defined blocks
        #and build a 'needsMerging' list to merge in the next step
        for metadataBlock in r['metadata']:
            for field, data in metadataBlock.iteritems():
                if field in NORMALIZE_SCHEMA:
                    metadataBlock[field] = NORMALIZE_SCHEMA[field](data)
            if metadataBlock['@type'] not in needsMerging:
                cr['metadata'].update({metadataBlock['@type']: metadataBlock})
            else:
                #If it shows up more than once, it needs merging.
                needsMerging[metadataBlock['@type']].append(metadataBlock)

        #Now merge the multiply defined metadataBlocks
        for entryType, data in needsMerging.iteritems():
            cr['metadata'].update({entryType: merge(data, r['@bibcode'], entryType, LOGGER)})

        #Finally, we have a complete record
        completeRecords.append(enforceSchema(cr))

    LOGGER.info('Added %s complete records' % len(completeRecords))
    return completeRecords
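# A standalone illustration of the Counter-based duplicate detection that
# updateRecords uses to decide which metadata blocks need merging (the blocks
# below are made-up placeholders, not real ADS metadata; collections is reused
# from the import updateRecords already requires).
def _needs_merging_demo():
    blocks = [
        {'@type': 'general', 'title': 'Title as exported by source A'},
        {'@type': 'general', 'title': 'Title as exported by source B'},
        {'@type': 'references', 'reference': ['2000A&A...362..333S']},
    ]
    counts = collections.Counter([entry['@type'] for entry in blocks])
    needsMerging = dict([(k, []) for k, v in counts.iteritems() if v > 1])
    for block in blocks:
        if block['@type'] in needsMerging:
            needsMerging[block['@type']].append(block)
    # Only the multiply defined '@type' ('general' here) ends up in
    # needsMerging and would be handed to merge(); 'references' is used as-is.
    return needsMerging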