Esempio n. 1
def static_file_merging():
    """Run the record merger on a static XML file, bypassing the extraction.

    The file is read in full, parsed with ``libxml2.parseDoc`` and the parsed
    document is handed to ``merge_records_xml``.

    Returns whatever ``merge_records_xml`` returns for that document.
    """
    static_file = "misc/2011ApJ...741...91C.xml"
    # Alternative sample files that can be swapped in for manual runs:
    #static_file = "misc/1999PASP..111..438F.xml"
    #static_file = "misc/1984A&A...130...97L.xml"
    # Use a context manager so the file handle is closed deterministically
    # instead of being left open until garbage collection.
    with open(static_file, "r") as xml_file:
        xml_content = xml_file.read()
    return merge_records_xml(libxml2.parseDoc(xml_content))
Esempio n. 2
def merge_bibcodes(bibcodes, print_adsxml=False, print_marcxml=False, write_xml_to_disk=False):
    # NOTE(review): this excerpt looks truncated -- the docstring quotes, the body
    # of the `for` loop and the bodies of both `with` blocks are missing, so the
    # function does not run as shown.
    # bibcodes: iterable of bibcodes to process.
    # print_adsxml / print_marcxml: when true, print the UTF-8 serialization of
    #     the exported ADS XML / of the stylesheet output.
    # write_xml_to_disk: when true, opens /tmp/adsxml.xml and /tmp/marcxml.xml
    #     for writing (what gets written is not visible in this excerpt).
    # Returns the first item of the pair returned by merge_records_xml; the
    # second item (bibcodes_with_problems) is discarded.
    Returns a merged version of the record identified by bibcode.
    # Extract the record from ADS.
    records = ADSRecords('full', 'XML')
    for bibcode in bibcodes:
    ads_xml_obj = records.export()
    if print_adsxml:
        print ads_xml_obj.serialize('UTF-8')
    if write_xml_to_disk:
        with open('/tmp/adsxml.xml', 'w') as f:
    # Convert to MarcXML.
    # The stylesheet is loaded from the file named by XSLT and applied to the
    # exported ADS document with no extra parameters (None).
    stylesheet = libxslt.parseStylesheetDoc(libxml2.parseFile(XSLT))
    xml_object = stylesheet.applyStylesheet(ads_xml_obj, None)
    if print_marcxml:
        print xml_object.serialize('UTF-8')
    if write_xml_to_disk:
        with open('/tmp/marcxml.xml', 'w') as f:
    # Merge the transformed document; only the merged records are returned.
    merged_records, bibcodes_with_problems = merge_records_xml(xml_object)
    return merged_records
    def test_01_merge_two_records_one_field(self):
        # Parses the `marcxml` fixture, passes it to m.merge_records_xml and
        # asserts that the first element of the result equals the first element
        # returned by create_record_from_libxml_obj for the `expected` fixture.
        # In the fixtures, tag 300 appears with origins (subfield 7) A&amp;A and
        # NED, and the expected XML keeps the A&amp;A one.
        # NOTE(review): this excerpt looks truncated -- the docstring quotes, the
        # closing XML tags and the closing triple quotes are missing.
        PRIORITY: 2 records, 1 field, 2 origins.
        marcxml = """<collections><collection>
    <datafield tag="300" ind1=" " ind2=" ">
      <subfield code="a">10</subfield>
      <subfield code="7">A&amp;A</subfield>
    <datafield tag="980" ind1="" ind2="">
        <subfield code="a">ASTRONOMY</subfield>
        <subfield code="7">ADS metadata</subfield>
    <datafield tag="300" ind1=" " ind2=" ">
      <subfield code="a">15</subfield>
      <subfield code="7">NED</subfield>
        expected = """<collections><collection><record>
  <datafield tag="300" ind1=" " ind2=" ">
    <subfield code="a">10</subfield>
    <subfield code="7">A&amp;A</subfield>
  <datafield tag="980" ind1="" ind2="">
        <subfield code="a">ASTRONOMY</subfield>
        <subfield code="7">ADS metadata</subfield>
        merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
        self.assertEqual(merged_record, create_record_from_libxml_obj(libxml2.parseDoc(expected), logger)[0])
    def test_04_merge_three_records_two_fields(self):
        # Parses the `marcxml` fixture, passes it to m.merge_records_xml and
        # asserts that the first element of the result equals the first element
        # returned by create_record_from_libxml_obj for the `expected` fixture.
        # In the fixtures, tag 300 appears with origins A&amp;A, NED and
        # "ADS metadata" and tag 773 with origins STI, AAS and OCR; the expected
        # XML keeps the "ADS metadata" 300 field and the AAS 773 field.
        # NOTE(review): this excerpt looks truncated -- the docstring quotes, the
        # closing XML tags and the closing triple quotes are missing.
        3 records, 6 fields, 6 origins.
        marcxml = """<collections><collection>
    <datafield tag="300" ind1=" " ind2=" ">
      <subfield code="a">10</subfield>
      <subfield code="7">A&amp;A</subfield>
    <datafield tag="773" ind1=" " ind2=" ">
      <subfield code="a">Libération</subfield>
      <subfield code="7">STI</subfield>
    <datafield tag="980" ind1="" ind2="">
      <subfield code="a">ASTRONOMY</subfield>
      <subfield code="7">ADS metadata</subfield>
    <datafield tag="773" ind1=" " ind2=" ">
      <subfield code="a">Le Monde</subfield>
      <subfield code="7">AAS</subfield>
    <datafield tag="300" ind1=" " ind2=" ">
      <subfield code="a">15</subfield>
      <subfield code="7">NED</subfield>
    <datafield tag="300" ind1=" " ind2=" ">
      <subfield code="a">5</subfield>
      <subfield code="7">ADS metadata</subfield>
    <datafield tag="773" ind1=" " ind2=" ">
      <subfield code="a">L'Express</subfield>
      <subfield code="7">OCR</subfield>
        expected = """<collections><collection><record>
  <datafield tag="300" ind1=" " ind2=" ">
    <subfield code="a">5</subfield>
    <subfield code="7">ADS metadata</subfield>
  <datafield tag="773" ind1=" " ind2=" ">
    <subfield code="a">Le Monde</subfield>
    <subfield code="7">AAS</subfield>
  <datafield tag="980" ind1="" ind2="">
    <subfield code="a">ASTRONOMY</subfield>
    <subfield code="7">ADS metadata</subfield>
        merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
        self.assertEqual(merged_record, create_record_from_libxml_obj(libxml2.parseDoc(expected), logger)[0])
    def test_02_merge_two_records_additional_subfield(self):
        # Builds the expected record from the `expected` fixture, merges the
        # `marcxml` fixture with m.merge_records_xml, and asserts that the first
        # tag-100 field of the merged record matches the expected one according
        # to b._compare_fields with strict=False.
        # In the fixtures, the A&amp;A author field has no "u" subfield while the
        # ARXIV one does; the expected field carries the A&amp;A values plus the
        # "u" subfield.
        # NOTE(review): this excerpt looks truncated -- the docstring quotes, the
        # closing XML tags and the closing triple quotes are missing.
        AUTHORS: 2 records, 1 additional subfield.
        marcxml = """<collections><collection>
    <datafield tag="100" ind1=" " ind2=" ">
      <subfield code="a">Di Milia, Giovanni</subfield>
      <subfield code="b">Di Milia, G</subfield>
      <subfield code="7">A&amp;A</subfield>
    <datafield tag="980" ind1="" ind2="">
      <subfield code="a">ASTRONOMY</subfield>
      <subfield code="7">ADS metadata</subfield>
    <datafield tag="100" ind1=" " ind2=" ">
      <subfield code="a">Di Milia, Giancarlo</subfield>
      <subfield code="b">Di Milia, G</subfield>
      <subfield code="u">Center for astrophysics</subfield>
      <subfield code="7">ARXIV</subfield>
        expected = """<collections><collection><record>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Di Milia, Giovanni</subfield>
    <subfield code="b">Di Milia, G</subfield>
    <subfield code="u">Center for astrophysics</subfield>
    <subfield code="7">A&amp;A</subfield>
  <datafield tag="980" ind1="" ind2="">
    <subfield code="a">ASTRONOMY</subfield>
    <subfield code="7">ADS metadata</subfield>
        #records = b.create_records(marcxml)
        expected_record = create_record_from_libxml_obj(libxml2.parseDoc(expected), logger)[0]
        merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
        self.assertTrue(b._compare_fields(merged_record[0]['100'][0], expected_record[0]['100'][0], strict=False))
def extractor_process(q_todo, q_done, q_probl, q_uplfile, lock_stdout, lock_createdfiles, q_life, extraction_directory, extraction_name):
    """Worker function for the extraction of bibcodes from ADS
        it has been defined outside any class because it's more simple to treat with multiprocessing """
    # NOTE(review): this excerpt appears truncated -- e.g. the `try:` lines that
    # the `except` clauses below belong to, the bodies of some `if`/`for`
    # statements and the start of two calls near the end are not present, so
    # the function does not run as shown.
    # Each task read from q_todo is indexed as task_todo[0] (group name, or
    # 'STOP') and task_todo[1] (iterable of bibcodes).
    # For every processed group, [group, bibcodes_ok] is put on q_done and
    # [group, bibcodes_probl] is put on q_probl.
    # q_uplfile, lock_stdout, lock_createdfiles and q_life are not referenced by
    # the visible lines -- presumably used by the missing ones; TODO confirm.
    # extraction_directory and extraction_name are used to build the log file
    # path and the output file path.
    logger.warning(multiprocessing.current_process().name + ' (worker) Process started')
    # Build a file handler and a formatter, and fetch the worker logger; the log
    # file is named after the current process.
    # NOTE(review): the statements attaching `fh`/`fmt` to the logger are not
    # visible in this excerpt.
    fh = logging.FileHandler(os.path.join(pipeline_settings.BASE_OUTPUT_PATH, extraction_directory, pipeline_settings.BASE_LOGGING_PATH, multiprocessing.current_process().name+'_worker.log'))
    fmt = logging.Formatter(pipeline_settings.LOGGING_FORMAT)
    local_logger = logging.getLogger(pipeline_settings.LOGGING_WORKER_NAME)
    local_logger.propagate = False
    # Log the same start message through the local logger
    local_logger.warning(multiprocessing.current_process().name + ' Process started')
    # Remove the automatic join from the queue of the files to upload
    # (the corresponding statement is not visible in this excerpt)
    # Get the maximum number of groups this worker may process
    max_num_groups = settings.MAX_NUMBER_OF_GROUP_TO_PROCESS
    # Flag used to tell whether the worker exits because the queue is empty or because it reached the maximum number of groups to process
    queue_empty = False

    # Process groups until there is nothing left to do or the maximum number of groups has been reached
    for grpnum in range(max_num_groups):

        task_todo = q_todo.get()
        if task_todo[0] == 'STOP':

            queue_empty = True
            # Exit the loop (the statement doing so is not visible in this excerpt)

        # Log the start of the extraction for this group
        local_logger.warning(multiprocessing.current_process().name + (' starting to process group %s' % task_todo[0]))

        # Then process the bibcodes
        # Two lists store the bibcodes processed successfully and the problematic ones
        bibcodes_ok = []
        bibcodes_probl = []

        # Create an ADSRecords object for this group
        recs = ADSRecords('full', 'XML')

        # Maximum number of bibcodes that may be skipped per cycle: the larger of
        # (bibcodes per group / 10) and settings.MAX_SKIPPED_BIBCODES.
        # If more than this amount is skipped, something is probably wrong with
        # the access to the data and it is better to stop everything.
        max_number_of_bibs_to_skip = max(settings.NUMBER_OF_BIBCODES_PER_GROUP / 10, settings.MAX_SKIPPED_BIBCODES)

        for bibcode in task_todo[1]:
            except Exception, error:
                local_logger.error(': problem retrieving the bibcode "%s" in group %s' % (bibcode, task_todo[0]))
                # Get the exception type so that its name can be recorded
                exc_type, exc_obj, exc_tb = sys.exc_info()
                    str_error_to_print = exc_type.__name__ + '\t' + str(error)
                        str_error_to_print = u'%s\t%s' % (unicode(exc_type.__name__), unicode(error))
                        local_logger.error(' Cannot log error for bibcode %s ' % bibcode)
                        str_error_to_print = ''
                # Record the bibcode as problematic and consume one unit of the skip budget
                bibcodes_probl.append((bibcode, str_error_to_print))
                max_number_of_bibs_to_skip = max_number_of_bibs_to_skip - 1
            # When the counter reaches 0 too many bibcodes were skipped and there is probably a problem: an exit for empty queue is simulated
            if max_number_of_bibs_to_skip == 0:
        # Exit from both loops
        if max_number_of_bibs_to_skip == 0:
            local_logger.warning(' Detected possible error with ADS data access: skipped %s bibcodes in one group' % max(settings.NUMBER_OF_BIBCODES_PER_GROUP / 10, settings.MAX_SKIPPED_BIBCODES))
            queue_empty = True

        # Export the object built for this group and release the exporter
        xmlobj = recs.export()
        del recs

            # Create a transformation object
            transf = xml_transformer.XmlTransformer(local_logger)
            # and transform the exported object
            marcxml = transf.transform(xmlobj)
            err_msg = ' Impossible to transform the XML!'
            raise GenericError(err_msg)

        if marcxml:
            # Merge the records
            merged_records, records_with_merging_probl = merger.merge_records_xml(marcxml)
            # Bibcodes that failed to merge are removed from "bibcodes_ok" and added to "bibcodes_probl"
            for elem in records_with_merging_probl:
                except ValueError:
                    local_logger.warning(' Problems to remove bibcode "%s" in group "%s" from the list of bibcodes extracted after merging' % (elem[0], task_todo[0]) )
                    if elem[0] in bibcodes_probl:
                        local_logger.error(': bibcode "%s" reached the merger but was in problematic bibcodes!' % elem[0])
            bibcodes_probl = bibcodes_probl + records_with_merging_probl
            # Pickle the merged records to a file named after the extraction and the group
            # NOTE(review): no close() for `output` or `bibrec_file_obj` is visible in this excerpt
            filepath = os.path.join(settings.BASE_OUTPUT_PATH, extraction_directory, pipeline_settings.BASE_BIBRECORD_FILES_DIR, pipeline_settings.BIBREC_FILE_BASE_NAME+'_'+extraction_name+'_'+task_todo[0])
            output = open(filepath, 'wb')
            pickle.dump(merged_records, output)
            # Then append the file path to a list file for eventual future recovery
            bibrec_file_obj = open(os.path.join(settings.BASE_OUTPUT_PATH, extraction_directory,settings.LIST_BIBREC_CREATED), 'a')
            bibrec_file_obj.write(filepath + '\n')
            # Finally append the file to the queue
            # (the two lines below are the tails of calls whose beginning is not visible in this excerpt)
  'Insert in queue for upload the file "%s" of the group "%s" ' % (filepath, task_todo[0]))
  'record created, merged but not uploaded')
            #bibupload_merger(merged_records, local_logger, 'replace_or_insert')
        # Otherwise all the bibcodes are moved to the problematic list
            bibcodes_probl = bibcodes_probl + [(bib, 'Bibcode extraction ok, but xml generation failed') for bib in bibcodes_ok]
            bibcodes_ok = []
        # Finally report the bibcodes that were processed successfully
        q_done.put([task_todo[0], bibcodes_ok])
        # and the problematic bibcodes
        q_probl.put([task_todo[0], bibcodes_probl])

        local_logger.warning(multiprocessing.current_process().name + (' finished to process group %s' % task_todo[0]))
    def test_01_merge_two_records_one_field(self):
        # Parses the `marcxml` fixture, passes it to m.merge_records_xml and
        # asserts that the first element of the result equals the first element
        # returned by create_record_from_libxml_obj for the `expected` fixture.
        # In the fixtures, the author fields (tags 100 and 700) appear once with
        # origin A&amp;A and once with origin ARXIV; the expected XML keeps the
        # A&amp;A versions.
        # NOTE(review): this excerpt looks truncated -- the docstring quotes, the
        # closing XML tags and the closing triple quotes are missing.
        AUTHORS: 2 records, priority.
        marcxml = """<collections><collection>
    <datafield tag="100" ind1=" " ind2=" ">
      <subfield code="a">Di Milia, Giovanni</subfield>
      <subfield code="b">Di Milia, G</subfield>
      <subfield code="7">A&amp;A</subfield>
    <datafield tag="700" ind1=" " ind2=" ">
      <subfield code="a">Luker, Jay</subfield>
      <subfield code="b">Luker, J</subfield>
      <subfield code="7">A&amp;A</subfield>
    <datafield tag="700" ind1=" " ind2=" ">
      <subfield code="a">Henneken, Edwin</subfield>
      <subfield code="b">Henneken, E</subfield>
      <subfield code="7">A&amp;A</subfield>
    <datafield tag="980" ind1="" ind2="">
      <subfield code="a">ASTRONOMY</subfield>
      <subfield code="7">ADS metadata</subfield>
    <datafield tag="100" ind1=" " ind2=" ">
      <subfield code="a">Dimilia, Giovanni</subfield>
      <subfield code="b">Dimilia, G</subfield>
      <subfield code="7">ARXIV</subfield>
    <datafield tag="700" ind1=" " ind2=" ">
      <subfield code="a">Luker, Jay</subfield>
      <subfield code="b">Luker, J</subfield>
      <subfield code="7">ARXIV</subfield>
    <datafield tag="700" ind1=" " ind2=" ">
      <subfield code="a">Henneken, Edwin</subfield>
      <subfield code="b">Henneken, E</subfield>
      <subfield code="7">ARXIV</subfield>
        expected = """<collections><collection><record>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Di Milia, Giovanni</subfield>
    <subfield code="b">Di Milia, G</subfield>
    <subfield code="7">A&amp;A</subfield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Luker, Jay</subfield>
    <subfield code="b">Luker, J</subfield>
    <subfield code="7">A&amp;A</subfield>
  <datafield tag="700" ind1=" " ind2=" ">
    <subfield code="a">Henneken, Edwin</subfield>
    <subfield code="b">Henneken, E</subfield>
    <subfield code="7">A&amp;A</subfield>
  <datafield tag="980" ind1="" ind2="">
    <subfield code="a">ASTRONOMY</subfield>
    <subfield code="7">ADS metadata</subfield>
        merged_record = m.merge_records_xml(libxml2.parseDoc(marcxml))[0]
        self.assertEqual(merged_record, create_record_from_libxml_obj(libxml2.parseDoc(expected), logger)[0])