예제 #1
0
 def test_07_components_no_blanks(self):
     """Compare ``components()`` with the expected list for each ``lccns`` entry."""
     for number, wanted in lccns.items():
         parts = callnumber.LC(number).components()
         # The fixture key itself must be a non-empty call number.
         self.assertTrue(number)
         # Length is checked first, then the full contents.
         self.assertEqual(len(wanted), len(parts))
         self.assertEqual(wanted, parts)
예제 #2
0
 def test_08_components_no_blanks(self):
     """Compare ``components(include_blanks=True)`` with ``lccns_with_blanks``.

     Despite the method name, this case exercises the variant that keeps
     blank components.
     """
     for number, wanted in lccns_with_blanks.items():
         parts = callnumber.LC(number).components(include_blanks=True)
         # The fixture key itself must be a non-empty call number.
         self.assertTrue(number)
         # Length is checked first, then the full contents.
         self.assertEqual(len(wanted), len(parts))
         self.assertEqual(wanted, parts)
예제 #3
0
def _write_addon_rows(addons):
    """Write *addons* to ``app/data/tmp/append_bsns.xml`` as ``printout`` rows.

    Each entry becomes a ``ROW`` whose ``BSN`` is the entry itself and whose
    ``BARCODE`` is the entry's position in *addons*.
    """
    root = ET.Element('printout')

    for position, bsn in enumerate(addons):
        row = ET.Element('ROW')
        bsn_node = ET.Element('BSN')
        bsn_node.text = bsn
        row.append(bsn_node)
        barcode_node = ET.Element('BARCODE')
        barcode_node.text = str(position)
        row.append(barcode_node)
        root.append(row)

    # Serialise and write once, after the tree is complete, instead of
    # rewriting the whole file on every iteration.
    with open('app/data/tmp/append_bsns.xml', 'w') as f:
        f.write(prettify_xml(ET.tostring(root)))


def _read_report(path):
    """Parse the combined report at *path* into a list of per-row dicts.

    Every dict has ``barcode`` and ``bsn``; ``volume`` and ``imprint`` are
    present only when the row carries ``VOLUME_INFO`` / ``Z13_IMPRINT``.
    The number of rows is printed as a side effect.
    """
    with open(path) as f:
        doc = xmltodict.parse(f.read())

    rows = doc['printout']['ROW']
    # xmltodict returns a single mapping (not a list) when the element occurs
    # only once; wrap it so counting and iteration work on rows, not on keys.
    if not isinstance(rows, list):
        rows = [rows]

    # Logging?
    print('There are {} records in this month\'s report.'.format(len(rows)))

    report = []
    for row in rows:
        item = {'barcode': row['BARCODE'], 'bsn': row['BSN']}
        if 'VOLUME_INFO' in row:
            # Put a space before any opening parenthesis.
            item['volume'] = row['VOLUME_INFO'].replace('(', ' (')
        if 'Z13_IMPRINT' in row:
            item['imprint'] = row['Z13_IMPRINT']
        report.append(item)

    return report


def _only_roman_chars(unistr):
    """Return True when every alphabetic character of *unistr* is Latin.

    A character counts as Latin when its Unicode name contains ``LATIN``;
    non-alphabetic characters are ignored.
    Approach from http://stackoverflow.com/a/3308844
    """
    # Move to newtitles.py
    import unicodedata as ud

    # The '' default means an alphabetic code point without a name in this
    # interpreter's Unicode database counts as non-Latin instead of raising
    # ValueError.
    return all('LATIN' in ud.name(uchr, '')
               for uchr in unistr
               if uchr.isalpha())  # isalpha suggested by John Machin


def _check_bsn(bsn):
    """Return True when the ``LIBRARY_API`` response for *bsn* has metadata.

    The URL is the ``LIBRARY_API`` environment variable followed by *bsn*;
    the response is parsed as XML and searched for an OAI 2.0 ``metadata``
    element.
    """
    urlstring = '%s%s' % (os.getenv('LIBRARY_API'), bsn)
    # Close the HTTP response as soon as it has been parsed.
    with urllib.request.urlopen(urlstring) as response:
        root = ET.parse(response).getroot()
    return bool(root.findall(".//{http://www.openarchives.org/OAI/2.0/}metadata"))


def _build_record(bsn, report_item):
    """Build the output record for *bsn* from ``NewTitle`` and its report row.

    *report_item* is the dict produced for the same row by ``_read_report``;
    only its optional ``volume`` entry is used.
    """
    # TODO: the API call was already made in _check_bsn -- capture that
    # response so there is no need to make a second call?
    new_title = NewTitle(bsn)

    record = {}
    record['bsn'] = bsn
    record['title'] = new_title.format_title()
    record['char'] = _only_roman_chars(record['title'])
    record['contributor'] = new_title.format_contributor()
    record['edition'] = new_title.format_edition()

    # NOTE(review): the previous code first derived the imprint from the
    # report row (Z13_IMPRINT, stripped, trailing '.' removed) and then
    # unconditionally overwrote it with format_imprint().  Only the effective
    # result is kept here; confirm whether the report value should win.
    record['imprint'] = new_title.format_imprint()
    record['collection'] = new_title.format_collection()
    record['series'] = new_title.format_series()

    if 'volume' in report_item:
        record['volume'] = report_item['volume'].replace('.', '. ')
    else:
        record['volume'] = ""

    # FIX!
    call_number = new_title.format_callnumber()
    if call_number:
        lccn = callnumber.LC(call_number).normalized
        if lccn is None:
            # Normalisation failed: fall back to the tidied raw call number.
            lccn = call_number.strip().title()
        if record['volume']:
            call_number += " " + record['volume']
    else:
        lccn = "Call number missing"
    record['callnumber'] = call_number
    record['lccn'] = lccn

    record['gift'] = new_title.format_gift()
    record['handle'] = new_title.format_handle()

    return record


def _assign_mapped_categories(records):
    """Set ``record['category']`` from ``app/data/ref/lc_classes.csv``.

    Each CSV row is read as (class letters, range start, range end,
    category).  A record gets the category of the first row whose class
    equals the first call-number component and whose numeric range contains
    the second component; every other record is left as ``'other'``.
    """
    with open('app/data/ref/lc_classes.csv', 'r') as f:
        lc_classes = list(csv.reader(f))

    # Group the rows by class once instead of rescanning the table for every
    # record.  Blank CSV lines come back as empty rows and are skipped.
    ranges_by_class = {}
    for row in lc_classes:
        if row:
            ranges_by_class.setdefault(row[0], []).append(row)

    for record in records:
        record['category'] = 'other'
        if not record['callnumber']:
            continue

        components = callnumber.LC(record['callnumber']).components()
        if not components:
            # Call number could not be split: report it and move on.
            print(record['title'], record['lccn'])
            continue
        if len(components) < 2:
            continue

        candidate_rows = ranges_by_class.get(components[0])
        if not candidate_rows:
            continue

        # Convert only after a class match, as before, so that unmatched
        # call numbers are never passed through float().
        number = float(components[1])
        for row in candidate_rows:
            if float(row[1]) <= number <= float(row[2]):
                record['category'] = row[3]
                break


def _record_sort_key(record):
    """Return ``(lccn, volume digits as int)`` for ordering the records."""
    # The leading "0" keeps int() valid when the volume holds no digits.
    digits = ''.join(filter(str.isdigit, "0" + record['volume']))
    return record['lccn'], int(digits)


def process(addons=None):
    """Build the new-titles record list and pickle it.

    Steps, in order:

    1. If *addons* is non-empty, write it to
       ``app/data/tmp/append_bsns.xml``.
    2. Copy ``app/data/in/<in_file>`` to ``app/data/tmp/report.xml``, combine
       the XML under ``app/data/tmp/`` with ``combine_xml`` and write the
       result to ``app/data/out/full_report.xml``.
    3. Re-read that file and, for every row whose BSN passes the
       ``LIBRARY_API`` check, build a record via ``NewTitle``.
    4. Categorise each record from the LC class table; records still marked
       ``'other'`` get the category from ``predict_categories`` and a ``*``
       prefix on their title.
    5. Sort by ``lccn`` then volume number and pickle the list to
       ``app/data/ref/newtitles.p``.

    ``in_file`` is not a parameter; it is read from module scope.

    Args:
        addons: optional iterable of extra BSN strings to append to the
            report.

    Returns:
        None.  Results are written to disk and progress is printed.
    """
    if addons:
        # Transform list into XML file
        _write_addon_rows(addons)
        # Delete infile?
    # NOTE(review): an append_bsns.xml left by an earlier run is not removed
    # when addons is empty -- confirm combine_xml should still pick it up.

    # Combine xml NT report with append
    # File should be named report.xml
    # Make an argument?
    process_infile = f'app/data/in/{in_file}'
    process_tmp = 'app/data/tmp/report.xml'
    copyfile(process_infile, process_tmp)

    combined_xml = combine_xml('app/data/tmp/')
    xmlstr = prettify_xml(combined_xml)

    process_outfile = 'app/data/out/full_report.xml'

    with open(process_outfile, "w") as f:
        f.write(xmlstr)

    report = _read_report(process_outfile)

    # Walk the rows directly.  Looking each barcode up again with
    # list.index() was quadratic and, for a repeated barcode, always returned
    # the first matching row, so later duplicates got the wrong BSN.
    records = []
    for item in report:
        bsn = item['bsn']
        if _check_bsn(bsn):
            records.append(_build_record(bsn, item))
        else:
            print(f'{bsn} is an invalid BSN. Skipping record...')

    print('\nFinished processing %d records with %d successes.' % (len(report), len(records)))

    ## Choose category using call number map
    _assign_mapped_categories(records)

    ## Guess category
    from app.categorize_nt import predict_categories
    # ^^^ Can put any categorization algorithm into this module

    titles = [record['title'] for record in records]

    for record, category in zip(records, predict_categories(titles)):
        if record['category'] == 'other':
            # The asterisk marks a category that was predicted, not mapped.
            record['title'] = "*" + record['title']
            record['category'] = category

    records = sorted(records, key=_record_sort_key)

    with open('app/data/ref/newtitles.p', 'wb') as f:
        pickle.dump(records, f)
예제 #4
0
 def test_06_start_of_range_equivalence(self):
     """The start of a call number's range equals its normalized form."""
     for raw in lccns:
         lccn = callnumber.LC(raw)
         # Keep the check the old code effectively performed: every fixture
         # must normalize to something truthy.
         self.assertTrue(lccn.normalized)
         # assertTrue(a, b) treats b as the failure message, so the two
         # values were never compared; assertEqual does the comparison.
         self.assertEqual(lccn.normalized, lccn.range_start)
예제 #5
0
 def test_05_compound_range(self):
     """Check both ends of the range for the call number ``A11.1``."""
     lccn = callnumber.LC('A11.1')
     # assertEqual, not assertTrue: the second argument of assertTrue is
     # only the failure message, so nothing was being compared.
     self.assertEqual(lccn.range_start, 'A  001110')
     self.assertEqual(lccn.range_end, 'A  001119~999~999~999')
예제 #6
0
 def test_04_simple_range(self):
     """Check both ends of the range for the bare class letter ``A``."""
     lccn = callnumber.LC('A')
     # assertEqual, not assertTrue: the second argument of assertTrue is
     # only the failure message, so nothing was being compared.
     self.assertEqual(lccn.range_start, 'A')
     self.assertEqual(lccn.range_end, 'A~~')
예제 #7
0
 def test_01_compound_normalization(self):
     """Check the denormalized and normalized forms of ``A11.1``."""
     lccn = callnumber.LC('A11.1')
     # assertEqual, not assertTrue: the second argument of assertTrue is
     # only the failure message, so nothing was being compared.
     self.assertEqual(lccn.denormalized, 'A11.1')
     self.assertEqual(lccn.normalized, 'A  001110')
예제 #8
0
 def test_00_simple_normalization(self):
     """Check that the bare class letter ``A`` is unchanged in both forms."""
     lccn = callnumber.LC('A')
     # assertEqual, not assertTrue: the second argument of assertTrue is
     # only the failure message, so nothing was being compared.
     self.assertEqual(lccn.denormalized, 'A')
     self.assertEqual(lccn.normalized, 'A')