def elem_856(symbol):
    """Build a ``record`` element with one 856 datafield per file of a symbol.

    Every file returned by ``File.find_by_identifier`` for the ``symbol``
    identifier contributes a ``datafield`` (tag 856, ind1 4, ind2 0) with
    these subfields:

    * ``y`` - the file's languages concatenated without a separator
    * ``9`` - the file id
    * ``s`` - the file size
    * ``u`` - the file URI

    Args:
        symbol: Value of the ``symbol`` identifier used for the file lookup.

    Returns:
        The ``record`` element; it has no children when no file is found.
    """
    record = ET.Element("record")

    for xfile in File.find_by_identifier(Identifier('symbol', symbol)):
        datafield = ET.SubElement(record, "datafield")
        for attribute, value in (("tag", "856"), ("ind1", "4"), ("ind2", "0")):
            datafield.set(attribute, value)

        # (subfield code, subfield text) pairs, in output order
        subfields = (
            ('y', ''.join(xfile.languages)),
            ('9', str(xfile.id)),
            ('s', str(xfile.size)),
            ('u', str(xfile.uri)),
        )
        for code, text in subfields:
            subfield = ET.SubElement(datafield, "subfield")
            subfield.set("code", code)
            subfield.text = text

    return record
def show_pdf(path):
    """Render ``test1.html`` for the latest English file of a symbol.

    Args:
        path: Symbol of the document; used as the ``symbol`` identifier value
            and passed to the template as ``symbol``.

    Returns:
        The rendered template, given the symbol and the file's URI prefixed
        with ``http://``.

    NOTE(review): the lookup result is dereferenced without a check, so this
    assumes a matching English file always exists - confirm against callers.
    """
    identifier = Identifier('symbol', path)
    xfile = File.latest_by_identifier_language(identifier, 'EN')

    # Echo the raw URI to stdout before rendering
    print(xfile.uri)

    page = render_template(
        'test1.html',
        symbol=path,
        uri='http://' + xfile.uri,
    )
    return page
def db():
    """Connect to the mock database and S3 bucket and load fixture data.

    The bib, auth, file and ``dlx_dl_log`` collections are dropped, then two
    auth records, two bib records and one plain-text file (identified by
    symbol ``TEST/1``, language ``EN``) are created.

    Returns:
        ``DB.client`` of the connected mock database.
    """
    from dlx import DB
    from dlx.marc import Bib, Auth
    from dlx.file import S3, File, Identifier
    from tempfile import TemporaryFile

    DB.connect('mongomock://localhost')  # ? does mock connection create a fresh db ?

    # Start from empty collections
    DB.bibs.drop()
    DB.auths.drop()
    DB.files.drop()
    DB.handle['dlx_dl_log'].drop()

    Auth().set('100', 'a', 'name_1').commit()
    Auth().set('100', 'a', 'name_2').commit()
    # NOTE(review): 700$a values 1 and 2 presumably refer to the two auth
    # records created above - confirm
    Bib().set('191', 'a', 'TEST/1').set('245', 'a', 'title_1').set('700', 'a', 1).commit()
    Bib().set('245', 'a', 'title_2').set('700', 'a', 2).commit()

    S3.connect(access_key='key', access_key_id='key_id', bucket='mock_bucket')
    S3.client.create_bucket(Bucket=S3.bucket)

    # The context manager closes the temporary file once it has been imported
    # instead of leaving the handle open
    with TemporaryFile() as handle:
        handle.write(b'some data')
        handle.seek(0)
        File.import_from_handle(
            handle,
            filename='',
            identifiers=[Identifier('symbol', 'TEST/1')],
            languages=['EN'],
            mimetype='text/plain',
            source='test',
        )

    return DB.client
def upload(fh, data):
    """Import one PDF file handle into the file store.

    Args:
        fh: Handle of the file; passed through to ``File.import_from_handle``.
        data: Mapping that provides ``symbol1``, ``symbol2`` and
            ``languageId`` (one of ``A``, ``C``, ``E``, ``F``, ``R``, ``S``,
            ``G``).

    Returns:
        The result of ``File.import_from_handle``. ``None`` when the file is
        skipped (a symbol contains ``JOURNAL``, or the language differs from
        ``args.language``) or when the import reports that the file already
        exists.

    Raises:
        KeyError: If ``data['languageId']`` is not a known language code.
        Exception: Any other import failure is reported as a JSON line on
            stdout and then re-raised.
    """
    symbols = [data['symbol1']]
    second_symbol = data['symbol2']
    if second_symbol and not second_symbol.isspace():
        symbols.append(second_symbol)

    # Generator instead of a list: stops at the first JOURNAL symbol
    if any(re.search(r'JOURNAL', x) for x in symbols):
        return

    # Empty symbols are excluded from both the identifiers and the file name
    named_symbols = list(filter(None, symbols))
    identifiers = [Identifier('symbol', x) for x in named_symbols]

    lang = {'A': 'AR', 'C': 'ZH', 'E': 'EN', 'F': 'FR', 'R': 'RU', 'S': 'ES', 'G': 'DE'}[data['languageId']]

    # Honor the optional command-line language filter
    if args.language and lang != args.language.upper():
        return

    languages = [lang]
    overwrite = bool(args.overwrite)

    try:
        return File.import_from_handle(
            fh,
            filename=encode_fn(named_symbols, languages[0], 'pdf'),
            identifiers=identifiers,
            languages=languages,
            mimetype='application/pdf',
            source='gdoc-dlx-' + args.station,
            overwrite=overwrite,
        )
    except FileExistsConflict as e:
        print(json.dumps({'warning': e.message, 'data': {'symbols': symbols, 'language': languages}}))
    except FileExists:
        print(json.dumps({'info': 'Already in the system', 'data': {'symbols': symbols, 'language': languages}}))
    except Exception as e:
        # Flatten multi-line messages so the report stays on one JSON line
        print(json.dumps({'error': '; '.join(re.split('[\r\n]', str(e))), 'data': {'symbols': symbols, 'languages': languages}}))
        # Bare raise keeps the original traceback intact
        raise
def _fft_from_files(bib):
    """Append one FFT field per language for the files of a bib's symbols.

    The symbols are the 191$a values followed by the 191$z values. For each
    language at most one field is added: the first symbol (in record order)
    that has a file in that language supplies it.

    Each FFT field carries:

    * ``a`` - the file URI prefixed with ``https://``
    * ``d`` - ``ISO_STR`` entry of the language
    * ``n`` - file name encoded from the symbols (all of them when there are
      at most three, otherwise only the first), the language and ``pdf``

    Args:
        bib: Bib record to read the symbols from; modified in place.

    Returns:
        The same ``bib`` object with the FFT fields appended.
    """
    symbols = bib.get_values('191', 'a') + bib.get_values('191', 'z')

    # The file name is identical for every field, so pick its symbols once
    filename_symbols = symbols if len(symbols) <= 3 else symbols[0:1]
    seen = set()

    # dict.fromkeys de-duplicates while keeping record order; iterating a
    # plain set would make it arbitrary which symbol's file wins a language
    for symbol in dict.fromkeys(symbols):
        if symbol in ('', ' ', '***'):  # note: clean these up in db
            continue

        for lang in ('AR', 'ZH', 'EN', 'FR', 'RU', 'ES', 'DE'):
            # Skip the lookup entirely when the language is already covered
            if lang in seen:
                continue

            xfile = File.latest_by_identifier_language(
                Identifier('symbol', symbol), lang)
            if not xfile:
                continue

            field = Datafield(record_type='bib', tag='FFT', ind1=' ', ind2=' ')
            field.set('a', 'https://' + xfile.uri)
            field.set('d', ISO_STR[lang])
            field.set('n', encode_fn(filename_symbols, lang, 'pdf'))
            bib.fields.append(field)
            seen.add(lang)

    return bib
def test_import_from_binary(db, s3):
    """Importing raw bytes must compare equal to the expected control value."""
    from io import BytesIO
    from dlx import Config, DB
    from dlx.file import File, Identifier, S3

    S3.client.create_bucket(Bucket=S3.bucket)  # this should be only necessary for testing

    # NOTE(review): presumably the MD5 hex digest of the payload - confirm
    expected = 'eb733a00c0c9d336e65691a37ab54293'

    imported = File.import_from_binary(
        b'test data',
        identifiers=[Identifier('isbn', '1')],
        filename='fn.ext',
        languages=['EN'],
        mimetype='application/dlx',
        source='test',
    )

    assert imported == expected
def test_import_from_url(db, s3):
    """Importing from a URL must compare equal to the expected control value.

    The GET request to ``http://127.0.0.1:9090`` is registered with
    ``responses`` and answered with the bytes ``test data``.
    """
    from http.server import HTTPServer
    from dlx.file import File, Identifier, S3

    S3.client.create_bucket(Bucket=S3.bucket)  # this should be only necessary for testing

    # NOTE(review): presumably the MD5 hex digest of the payload - confirm
    control = 'eb733a00c0c9d336e65691a37ab54293'

    # The server only claims the port (no handler, never serves); using it as
    # a context manager releases the listening socket when the test ends
    with HTTPServer(('127.0.0.1', 9090), None):
        responses.add(responses.GET, 'http://127.0.0.1:9090', body=b'test data')

        assert File.import_from_url(
            url='http://127.0.0.1:9090',
            identifiers=[Identifier('isbn', '3')],
            filename='test',
            languages=['EN'],
            mimetype='test',
            source=None,
        ) == control
def test_import_from_path(db, s3):
    """Importing from a file path must compare equal to the control value."""
    from tempfile import NamedTemporaryFile
    from dlx.file import S3, File, Identifier

    S3.client.create_bucket(Bucket=S3.bucket)  # this should be only necessary for testing

    # NOTE(review): presumably the MD5 hex digest of the payload - confirm
    control = 'eb733a00c0c9d336e65691a37ab54293'

    # The context manager closes (and thereby removes) the temporary file
    # after the import instead of leaving the handle open
    with NamedTemporaryFile() as fh:
        fh.write(b'test data')
        # The import opens the file by name, so the bytes must be written out
        fh.flush()
        fh.seek(0)

        assert File.import_from_path(
            fh.name,
            identifiers=[Identifier('isbn', '1')],
            filename='fn.ext',
            languages=['EN'],
            mimetype='application/dlx',
            source='test',
        ) == control
def test_import_from_handle(db, s3):
    """Check import from a file handle and the conflict errors it raises.

    Steps:
        1. Import ``some data`` for ISBN 1 / EN and check the stored database
           record and the object downloaded from the bucket.
        2. Import the same data for a different identifier and expect
           ``FileExistsIdentifierConflict``.
        3. Import the same data for the same identifier but another language
           and expect ``FileExistsLanguageConflict``.
    """
    from tempfile import TemporaryFile
    from dlx import DB
    from dlx.file import S3, File, Identifier, FileExistsIdentifierConflict, FileExistsLanguageConflict

    S3.client.create_bucket(Bucket=S3.bucket)  # this should be only necessary for testing

    # Every temporary file is opened in a ``with`` block so its handle is
    # closed as soon as the step that needs it is done
    with TemporaryFile() as handle:
        handle.write(b'some data')
        handle.seek(0)
        File.import_from_handle(
            handle,
            identifiers=[Identifier('isbn', '1')],
            filename='fn.ext',
            languages=['EN'],
            mimetype='application/dlx',
            source='test',
        )

    results = list(DB.files.find({'identifiers': {'type': 'isbn', 'value': '1'}}))
    assert len(results) == 1
    assert results[0]['filename'] == 'fn.ext'
    assert results[0]['languages'] == ['EN']
    assert results[0]['mimetype'] == 'application/dlx'
    assert results[0]['source'] == 'test'
    assert results[0]['uri'] == '{}.s3.amazonaws.com/{}'.format(S3.bucket, results[0]['_id'])

    # The stored object must contain exactly the imported bytes
    with TemporaryFile() as fh:
        S3.client.download_fileobj(S3.bucket, results[0]['_id'], fh)
        fh.seek(0)
        assert fh.read() == b'some data'

    # Same content, different identifier. Only the import call sits inside
    # ``pytest.raises`` so the handle setup cannot satisfy the expectation.
    with TemporaryFile() as handle:
        handle.write(b'some data')
        handle.seek(0)
        with pytest.raises(FileExistsIdentifierConflict):
            File.import_from_handle(
                handle,
                identifiers=[Identifier('isbn', '2')],
                filename='test',
                languages=['FR'],
                mimetype='test',
                source=None,
            )

    # Same content and identifier, different language
    with TemporaryFile() as handle:
        handle.write(b'some data')
        handle.seek(0)
        with pytest.raises(FileExistsLanguageConflict):
            File.import_from_handle(
                handle,
                identifiers=[Identifier('isbn', '1')],
                filename='test',
                languages=['FR'],
                mimetype='test',
                source=None,
            )
langs = row[3].split(',') for l in langs: lang.append(LANGS[l]) else: try: lang = [LANGS[row[3]]] except KeyError: print( f"LanguageError: Unable to determine language for {filename} and {symbol}. This file won't be imported." ) break ext = filename.split('.')[-1] encoded_filename = encode_fn(symbol, lang, ext) identifiers = [] identifiers.append(Identifier('symbol', symbol)) #key = "{}/{}/PDF/{}".format(base_path, subfolder, filename) print(encoded_filename) save_file = "{}/{}".format(tmpdir, filename) if args.skipdb: key = f"{base_path}/{subfolder}/{filename}" else: table = dynamodb.Table(args.table) # Use the filename to query the DigitizationIndex response = table.query( IndexName=args.index, KeyConditionExpression=Key('filename').eq(filename)) #print(response)
def run():
    """Download documents from ODS and import them into the file store.

    The symbols come from ``args.symbol`` or, when that is not set, from the
    first tab-separated column of each line of the file ``args.list``. The
    languages are ``args.language`` or, when that is not set, every key of
    ``LANG``.

    For each symbol the matching bib record is looked up by 191$a / 191$z:

    * without ``--skip_check`` a symbol with no bib is skipped, otherwise all
      191$a and 191$z values of the bib (except empty ones and those starting
      with ``[``) become the identifiers of the imported files;
    * with ``--skip_check`` the symbol itself is the only identifier.

    Download failures and "already exists" conditions are logged and the loop
    carries on; any other import error propagates to the caller.
    """
    args = get_args()
    DLX.connect(args.dlx_connect)
    S3.connect(bucket=args.s3_bucket)

    if args.symbol:
        symbols = [args.symbol]
    else:
        # Read the list inside a context manager so the file gets closed;
        # blank lines carry no symbol and are skipped
        with open(args.list) as list_file:
            first_columns = (line.split('\t')[0].strip() for line in list_file)
            symbols = [symbol for symbol in first_columns if symbol]

    langs = [args.language] if args.language else LANG.keys()

    for sym in symbols:
        bib = Bib.from_query(
            Query(Or(Condition('191', {'a': sym}), Condition('191', {'z': sym}))),
            collation=Collation(locale='en', strength=2))

        if args.skip_check:
            # NOTE(review): this message is logged whenever --skip_check is
            # set, including when the bib was found - confirm the wording
            logging.warning(
                f'Bib for document {sym} not found with --skip_check enabled. Using {sym} as identifier'
            )
            # Only the current symbol identifies the file, as the message
            # says; using the whole input list would attach every listed
            # symbol to every file
            ids = [sym]
        elif not bib:
            logging.warning(f'Bib for document {sym} not found. Skipping.')
            continue
        else:
            # capture symbols from the bib record (exclude those beginning
            # with brackets); empty values are dropped as well, which also
            # avoids indexing into an empty string
            ids = [
                x
                for x in bib.get_values('191', 'a') + bib.get_values('191', 'z')
                if x and not x.startswith('[')
            ]

        for lang in langs:
            logging.info(f'Getting {sym} {lang} ...')

            try:
                # --ods_symbol overrides the symbol used for the download only
                fh = ODS.download(args.ods_symbol or sym, lang)
            except FileNotFound:
                logging.warning(f'{sym} {lang} not found in ODS')
                continue
            except Exception as e:
                logging.warning(e)
                continue

            isolang = LANG[lang]

            try:
                result = File.import_from_handle(
                    fh,
                    filename=File.encode_fn(sym, isolang, 'pdf'),
                    identifiers=[Identifier('symbol', s) for s in ids],
                    languages=[isolang],
                    mimetype='application/pdf',
                    source='ods-importx',
                    overwrite=args.overwrite)
                logging.info(f'OK - {result.id}')
            except FileExistsLanguageConflict as e:
                logging.warning(f'{e.message} X {isolang}')
            except FileExistsIdentifierConflict as e:
                logging.warning(f'{e.message} X {ids}')
            except FileExists:
                logging.info('Already in the system')