Example #1
def merge_bibcodes(bibcodes, print_adsxml=False, print_marcxml=False, write_xml_to_disk=False):
    """
    Returns a merged version of the record identified by bibcode.
    """
    # Extract the record from ADS.
    records = ADSRecords('full', 'XML')
    for bibcode in bibcodes:
        records.addCompleteRecord(bibcode)
    ads_xml_obj = records.export()
    
    if print_adsxml:
        print ads_xml_obj.serialize('UTF-8')
    if write_xml_to_disk:
        with open('/tmp/adsxml.xml', 'w') as f:
            f.write(ads_xml_obj.serialize('UTF-8'))
    
    # Convert to MarcXML.
    stylesheet = libxslt.parseStylesheetDoc(libxml2.parseFile(XSLT))
    xml_object = stylesheet.applyStylesheet(ads_xml_obj, None)
    
    if print_marcxml:
        print xml_object.serialize('UTF-8')
    if write_xml_to_disk:
        with open('/tmp/marcxml.xml', 'w') as f:
            f.write(xml_object.serialize('UTF-8'))
    
    merged_records, bibcodes_with_problems = merge_records_xml(xml_object)
    return merged_records
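
A minimal calling sketch for Example #1. The bibcodes below are hypothetical placeholders, and the surrounding ADS export setup (ADSRecords, the XSLT stylesheet path) is assumed to be configured exactly as in the example.

# Hedged usage sketch, not part of the original example.
if __name__ == '__main__':
    sample_bibcodes = ['2019Test.placeholder..1A', '2019Test.placeholder..2B']  # placeholders
    merged = merge_bibcodes(sample_bibcodes, print_marcxml=True, write_xml_to_disk=True)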
Example #2
def readRecords(records,LOGGER=settings.LOGGER):
  '''
  records: [(bibcode,JSON_fingerprint),...]
  '''
  h = hash(json.dumps(records))
  if not records:
    LOGGER.debug("No records given")
    return []

  targets = dict(records)

  s = time.time()
  records = ADSRecords('full','XML')
  failures = []
  for bibcode in targets.keys():
    try:
      records.addCompleteRecord(bibcode)
    except KeyboardInterrupt:
      raise
    except:
      failures.append(bibcode)
      LOGGER.warning("[%s] ADSRecords failed" % bibcode)
  records = records.export()
  if not records.content:
    return []
  ttc = time.time()-s
  rate = len(targets)/ttc
  if failures:
    LOGGER.warning('ADSRecords failed to retrieve %s records' % len(failures))
  LOGGER.info('ADSRecords took %0.1fs to query %s records (%0.1f rec/s)\t[%s]' % (ttc,len(targets),rate,h))

  records = ensureList(xmltodict.parse(records.__str__())['records']['record'])
  assert(len(records)==len(targets)-len(failures))

  # with open('%s.pickle' % uuid.uuid4(),'w') as fp:
  #   pickle.dump(records,fp)
  return records,targets
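
A minimal calling sketch for Example #2. The (bibcode, JSON_fingerprint) pairs below are hypothetical placeholders; per the code above, the function returns a (records, targets) tuple on success and [] when there is no input or no exported content.

# Hedged usage sketch, not part of the original example.
pairs = [('2019Test.placeholder..1A', 'fingerprint-1'),
         ('2019Test.placeholder..2B', 'fingerprint-2')]
result = readRecords(pairs)
if result:
    raw_records, targets = result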
Example #3
def extractor_process(q_todo, q_done, q_probl, q_uplfile, lock_stdout, lock_createdfiles, q_life, extraction_directory, extraction_name):
    """Worker function for the extraction of bibcodes from ADS.
        It is defined outside any class because that makes it simpler to use with multiprocessing."""
    logger.warning(multiprocessing.current_process().name + ' (worker) Process started')
    #I create a local logger
    fh = logging.FileHandler(os.path.join(pipeline_settings.BASE_OUTPUT_PATH, extraction_directory, pipeline_settings.BASE_LOGGING_PATH, multiprocessing.current_process().name+'_worker.log'))
    fmt = logging.Formatter(pipeline_settings.LOGGING_FORMAT)
    fh.setFormatter(fmt)
    local_logger = logging.getLogger(pipeline_settings.LOGGING_WORKER_NAME)
    local_logger.addHandler(fh)
    local_logger.setLevel(logger.level)
    local_logger.propagate = False
    #I print the same message for the local logger
    local_logger.warning(multiprocessing.current_process().name + ' Process started')
    
    #I remove the automatic join from the queue of the files to upload
    q_uplfile.cancel_join_thread()
    
    #I get the maximum number of groups I can process
    max_num_groups = settings.MAX_NUMBER_OF_GROUP_TO_PROCESS
    #variable used to know if I'm exiting because the queue is empty or because I reached the maximum number of groups to process
    queue_empty = False

    #while there is something to process and I haven't reached the maximum number of groups, I keep processing
    for grpnum in range(max_num_groups):

        task_todo = q_todo.get()
        if task_todo[0] == 'STOP':

            queue_empty = True
            #I exit the loop
            break

        #I print when I'm starting the extraction
        local_logger.warning(multiprocessing.current_process().name + (' starting to process group %s' % task_todo[0]))

        ############
        #then I process the bibcodes
        # I define a couple of lists to store the processed bibcodes
        bibcodes_ok = []
        bibcodes_probl = []

        #I define an ADSRecords export object
        recs = ADSRecords('full', 'XML')

        # I define the maximum number of bibcodes I can skip per cycle: the number of bibcodes per group / 10, with a floor of MAX_SKIPPED_BIBCODES
        # if I skip more than this amount it means that there is something
        # wrong with the access to the data and it's better to stop everything
        max_number_of_bibs_to_skip = max(settings.NUMBER_OF_BIBCODES_PER_GROUP / 10, settings.MAX_SKIPPED_BIBCODES)

        for bibcode in task_todo[1]:
            try:
                recs.addCompleteRecord(bibcode)
                bibcodes_ok.append(bibcode)
            except Exception, error:
                local_logger.error(': problem retrieving the bibcode "%s" in group %s' % (bibcode, task_todo[0]))
                #I extract the exception type name
                exc_type, exc_obj, exc_tb = sys.exc_info()
                try:
                    str_error_to_print = exc_type.__name__ + '\t' + str(error)
                except:
                    try:
                        str_error_to_print = u'%s\t%s' % (unicode(exc_type.__name__), unicode(error))
                    except:
                        local_logger.error(' Cannot log error for bibcode %s ' % bibcode)
                        str_error_to_print = ''
                bibcodes_probl.append((bibcode, str_error_to_print))
                max_number_of_bibs_to_skip = max_number_of_bibs_to_skip - 1
            #If the counter reaches 0 it means I skipped the maximum allowed number of bibcodes and there is probably a problem: so I simulate an exit for an empty queue
            if max_number_of_bibs_to_skip == 0:
                break
        #if the inner loop hit the skip limit, I exit the outer loop as well
        if max_number_of_bibs_to_skip == 0:
            local_logger.warning(' Detected possible error with ADS data access: skipped %s bibcodes in one group' % max(settings.NUMBER_OF_BIBCODES_PER_GROUP / 10, settings.MAX_SKIPPED_BIBCODES))
            queue_empty = True
            break

        #I extract the object I created
        xmlobj = recs.export()
        del recs

        try:
            #I define a transformation object
            transf = xml_transformer.XmlTransformer(local_logger)
            #and I transform my object
            marcxml = transf.transform(xmlobj)
        except:
            err_msg = ' Impossible to transform the XML!'
            local_logger.critical(err_msg)
            raise GenericError(err_msg)

        if marcxml:
            #I merge the records
            merged_records, records_with_merging_probl = merger.merge_records_xml(marcxml)
            #If some records had merging problems I remove their bibcodes from the list "bibcodes_ok" and add them to "bibcodes_probl"
            for elem in records_with_merging_probl:
                try:
                    bibcodes_ok.remove(elem[0])
                except ValueError:
                    local_logger.warning(' Problems to remove bibcode "%s" in group "%s" from the list of bibcodes extracted after merging' % (elem[0], task_todo[0]) )
                    if elem[0] in bibcodes_probl:
                        local_logger.error(': bibcode "%s" reached the merger but was in problematic bibcodes!' % elem[0])
            bibcodes_probl = bibcodes_probl + records_with_merging_probl
            #########
            #I write the object in a file
            ##########
            filepath = os.path.join(settings.BASE_OUTPUT_PATH, extraction_directory, pipeline_settings.BASE_BIBRECORD_FILES_DIR, pipeline_settings.BIBREC_FILE_BASE_NAME+'_'+extraction_name+'_'+task_todo[0])
            output = open(filepath, 'wb')
            pickle.dump(merged_records, output)
            output.close()
            #then I write the filepath to a file for possible future recovery
            lock_createdfiles.acquire()
            bibrec_file_obj = open(os.path.join(settings.BASE_OUTPUT_PATH, extraction_directory,settings.LIST_BIBREC_CREATED), 'a')
            bibrec_file_obj.write(filepath + '\n')
            bibrec_file_obj.close()
            lock_createdfiles.release()
            #finally I append the file to the queue
            local_logger.info('Insert in queue for upload the file "%s" of the group "%s" ' % (filepath, task_todo[0]))
            q_uplfile.put((task_todo[0],filepath))
            
            #logger.info('record created, merged but not uploaded')
            #bibupload_merger(merged_records, local_logger, 'replace_or_insert')
            
            
        #otherwise I put all the bibcodes in the problematic list
        else:
            bibcodes_probl = bibcodes_probl + [(bib, 'Bibcode extraction ok, but xml generation failed') for bib in bibcodes_ok]
            bibcodes_ok = []
        
        
        #finally I pass the done bibcodes to the proper queue
        q_done.put([task_todo[0], bibcodes_ok])
        #and the problematic bibcodes
        q_probl.put([task_todo[0], bibcodes_probl])

        local_logger.warning(multiprocessing.current_process().name + (' finished to process group %s' % task_todo[0]))
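
A hedged sketch of how the worker in Example #3 might be wired up. The queue and lock objects, the worker count, and the directory/name strings are all placeholders; in the real pipeline a manager process is assumed to feed q_todo with (group_id, [bibcode, ...]) tasks and one ('STOP',) sentinel per worker, as implied by the code above.

# Not part of the original example: spawn extractor workers with multiprocessing.
import multiprocessing

q_todo, q_done, q_probl, q_uplfile, q_life = [multiprocessing.Queue() for _ in range(5)]
lock_stdout, lock_createdfiles = multiprocessing.Lock(), multiprocessing.Lock()

workers = [multiprocessing.Process(target=extractor_process,
                                   args=(q_todo, q_done, q_probl, q_uplfile,
                                         lock_stdout, lock_createdfiles, q_life,
                                         'extraction_dir_placeholder', 'extraction_name_placeholder'))
           for _ in range(2)]
for w in workers:
    w.start()
# The manager would then put (group_id, [bibcode, ...]) tasks and ('STOP',) sentinels on q_todo.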
Example #4
def updateRecords(records,LOGGER=settings.LOGGER):

  if not records:
    LOGGER.debug("No records given")
    return []

  targets = dict(records)

  s = time.time()
  records = ADSRecords('full','XML')
  failures = []
  for bibcode in targets.keys():
    try:
      records.addCompleteRecord(bibcode)
    except KeyboardInterrupt:
      raise
    except:
      failures.append(bibcode)
      LOGGER.warning("[%s] ADSRecords failed" % bibcode)
  records = records.export()
  if not records.content:
    return []
  ttc = time.time()-s
  rate = len(targets)/ttc
  if failures:
    LOGGER.warning('ADSRecords failed to retrieve %s records' % len(failures))
  LOGGER.info('ADSRecords took %0.1fs to query %s records (%0.1f rec/s)' % (ttc,len(targets),rate))

  records = ensureList(xmltodict.parse(records.__str__())['records']['record'])
  with open('raw.txt','a') as fp:
    for r in records:
      fp.write('%s' % r)
      fp.write('\n\n')
  assert(len(records)==len(targets)-len(failures))

  #Could send these tasks out on a queue
  completeRecords = []
  for r in records:
    #Define top-level schema that will go in mongo
    cr = {
      'bibcode': r['@bibcode'],
      'JSON_fingerprint': targets[r['@bibcode']],
      'metadata' : {},
    }

    #Find metadata blocks that need merging
    metadataCounter = collections.Counter([entry['@type'] for entry in r['metadata']])
    needsMerging = dict([(k,[]) for k,v in metadataCounter.iteritems() if v>1])

    #Iterate over metadata blocks; directly input single defined blocks
    #and build a 'needsMerging' list to merge in the next step
    for metadataBlock in r['metadata']: 
      for field,data in metadataBlock.iteritems():
        if field in NORMALIZE_SCHEMA:
          metadataBlock[field] = NORMALIZE_SCHEMA[field](data)
      if metadataBlock['@type'] not in needsMerging:
        cr['metadata'].update({metadataBlock['@type']:metadataBlock})
      else: #If it shows up more than once, it needs merging.
        needsMerging[metadataBlock['@type']].append(metadataBlock)
    #Now merge the multiply defined metadataBlocks
    for entryType,data in needsMerging.iteritems():
      cr['metadata'].update({entryType:merge(data,r['@bibcode'],entryType,LOGGER)})
    
    #Finally, we have a complete record
    completeRecords.append(enforceSchema(cr))

  LOGGER.info('Added %s complete records' % len(completeRecords))
  return completeRecords
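
A minimal calling sketch for Example #4. The bibcode/fingerprint pairs are hypothetical placeholders; the returned list of schema-enforced records is what the caller would presumably go on to store (e.g. in mongo, as the comment in the code suggests).

# Hedged usage sketch, not part of the original example.
pairs = [('2019Test.placeholder..1A', 'fingerprint-1'),
         ('2019Test.placeholder..2B', 'fingerprint-2')]
complete_records = updateRecords(pairs)
for record in complete_records:
    settings.LOGGER.debug('prepared record for %s' % record['bibcode'])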