Example #1
def test_from_query(db):
    from dlx.marc import MarcSet, BibSet, AuthSet, QueryDocument, Condition
    
    bibset = BibSet.from_query({'_id': {'$in': [1, 2]}})
    assert isinstance(bibset, (MarcSet, BibSet))
    assert bibset.count == 2
    assert isinstance(bibset.records, map)
    bibset.cache()
    assert isinstance(bibset.records, list)
    
    bibset = BibSet.from_query({}, skip=0, limit=1)
    assert bibset.count == 1
    for bib in bibset:
        assert bib.id == 1
    assert len(list(bibset.records)) == 0
    assert bibset.count == 1
    
    conditions = [
        Condition(tag='150', subfields={'a': 'Header'}),
        Condition(tag='200', modifier='not_exists')
    ]
    authset = AuthSet.from_query(conditions)
    assert isinstance(authset, (MarcSet, AuthSet))
    assert authset.count == 1
    assert isinstance(authset.records, map)
    authset.cache()
    assert isinstance(authset.records, list)
    
    query = QueryDocument(
        Condition('245', modifier='exists')
    )
    bibset = BibSet.from_query(query)
    assert isinstance(bibset, BibSet)
    assert bibset.count == 2
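
Note on iteration: as the test above asserts, BibSet.records is a one-shot
iterator (a map) until cache() is called, after which it is a list. A minimal
sketch of the safe pattern when a set has to be traversed more than once,
using the same API the test exercises:

from dlx.marc import BibSet

bibset = BibSet.from_query({})
bibset.cache()                    # materialize the records into a list
assert isinstance(bibset.records, list)
for bib in bibset.records:        # the cached list can be iterated repeatedly
    print(bib.id)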
Example #2
def date_unbis(date):
    '''
    Outputs records in the native central DB schema JSON format for the date
    provided as a dynamic route in YYYYMMDD or YYYY-MM-DD format,
    e.g. /YYYY-MM-DD/json or /YYYYMMDD/json?skip=n&limit=m.
    The skip=n URL parameter skips n records (default 0).
    The limit=m URL parameter caps the number of records returned (default 50).
    If the date is in the wrong format, the function returns today's records.
    Returns a JSON mapping of UNBIS T codes (035 $a) to subject headings (150 $a).
    '''
    try:
        skp = int(request.args.get('skip'))
    except (TypeError, ValueError):
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except (TypeError, ValueError):
        limt = 50
    #print(f"skip is {skp} and limit is {limt}")
    str_date = date.replace('-', '')
    print(f"the original str_date is {str_date}")
    if len(str_date) != 8:
        # fall back to today's records when the date is malformed
        str_date = datetime.datetime.now().strftime('%Y%m%d')
    print(f"the str_date is {str_date}")
    query = QueryDocument(
        Condition(tag='998', subfields={'z': re.compile('^' + str_date)}),
        Condition(tag='035', subfields={'a': re.compile('^T')}))

    #print(query.to_json())
    '''
    authset = AuthSet.from_query(query, projection={'035':1,'150':1}, skip=skp, limit=limt)
    unbis=authset.to_xml()
    return Response(unbis, mimetype='text/xml')
    '''
    dict1 = {}
    authset = AuthSet.from_query(query,
                                 projection={
                                     '035': 1,
                                     '150': 1
                                 },
                                 skip=skp,
                                 limit=limt)
    for auth in authset:
        val_035a = auth.get_values('035', 'a')
        #print(f"035 values are: {val_035a}")
        # avoid shadowing the str builtin; keep only values beginning with 'T'
        val_035a = ''.join(v for v in val_035a if v.startswith('T'))
        dict1[val_035a] = auth.get_value('150', 'a')
        #dict1['FR']=auth.get_value('993','a')
    #unbis=authset.to_xml()
    #return Response(unbis, mimetype='text/xml')
    return jsonify(dict1)
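
The skip/limit parsing repeated throughout these routes can be collapsed with
Flask's built-in converter: request.args.get(key, default, type=int) returns
the default when the parameter is missing or fails to convert. A minimal
sketch (the helper name is hypothetical):

from flask import request

def paging_params():
    # Flask falls back to the default when the value is absent or not an int
    skp = request.args.get('skip', default=0, type=int)
    limt = request.args.get('limit', default=50, type=int)
    return skp, limt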
Example #3
def xml(date):
    '''
    Outputs records in MARCXML format for the date provided as a dynamic route
    in YYYYMMDD or YYYY-MM-DD format, e.g. /YYYYMMDD/xml?skip=n&limit=m.
    The skip=n URL parameter skips n records (default 0).
    The limit=m URL parameter caps the number of records returned (default 50).
    If the date is in the wrong format, the function returns today's records.
    Uses DLX's bibset.to_xml serialization to output MARCXML.
    '''
    try:
        skp = int(request.args.get('skip'))
    except (TypeError, ValueError):
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except (TypeError, ValueError):
        limt = 50
    print(f"skip is {skp} and limit is {limt}")
    str_date = date.replace('-', '')
    print(f"the original str_date is {str_date}")
    if len(str_date) != 8:
        # fall back to today's records when the date is malformed; the original
        # code called date.year on a string, which would raise AttributeError
        str_date = datetime.now().strftime('%Y%m%d')
    print(f"the str_date is {str_date}")
    query = QueryDocument(
        Condition(tag='998', subfields={'z': re.compile('^' + str_date)}),
        Condition(tag='029', subfields={'a': 'JN'}))
    print(query.to_json())
    start_time = datetime.now()
    bibset = BibSet.from_query(query,
                               projection={
                                   '029': 1,
                                   '091': 1,
                                   '191': 1,
                                   '245': 1,
                                   '269': 1,
                                   '650': 1,
                                   '991': 1
                               },
                               skip=skp,
                               limit=limt)
    print(f"duration for 998z was {datetime.now()-start_time}")
    start_time_xml = datetime.now()
    xml = bibset.to_xml()

    # remove double spaces from the XML; they cause problems with the job
    # number on ODS export
    xml = xml.replace("  ", " ")
    print(
        f"duration for xml serialization was {datetime.now()-start_time_xml}")
    return Response(xml, mimetype='text/xml')
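
Several routes repeat this date normalization; a sketch of a shared helper
(the name is hypothetical) that validates the digits instead of only checking
the length:

from datetime import datetime

def normalize_date(date_str):
    # accept YYYYMMDD or YYYY-MM-DD; fall back to today on malformed input
    s = date_str.replace('-', '')
    try:
        return datetime.strptime(s, '%Y%m%d').strftime('%Y%m%d')
    except ValueError:
        return datetime.now().strftime('%Y%m%d')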
Example #4
def show_xml856(path):
    query = QueryDocument(
        Condition(
            tag='191',
            #subfields={'a': re.compile('^'+path+'$')}
            subfields={'a': path}))
    #print(f" the imp query is  -- {query.to_json()}")
    ts2 = time.time()
    bibset = BibSet.from_query(query,
                               projection={
                                   '029': 1,
                                   '091': 1,
                                   '191': 1,
                                   '245': 1,
                                   '269': 1,
                                   '650': 1,
                                   '991': 1
                               })
    # add856 is where we insert 856 tags with file info
    print(f"time for query is {time.time()-ts2}")
    ts3 = time.time()
    xml = add856(bibset)
    print(f"total time for adding 856 is {time.time()-ts3}")
    #xml=bibset.to_xml()
    # decode to string and remove double spaces from the XML; they cause
    # problems with the job number on ODS export
    xml = xml.decode("utf-8").replace("  ", " ")
    return Response(xml, mimetype='text/xml')
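
The add856 helper itself is not shown in this listing. A rough illustration of
what it might do, assuming dlx's Marc.set() for writing subfield values and
the bib.files() call used the same way in Example #18; this is a sketch, not
the real implementation:

def add856(bibset):
    # hypothetical reconstruction: attach an 856 $u file link to each record
    for bib in bibset.records:
        url = ''.join(bib.files('EN'))            # as in Example #18
        if url:
            bib.set('856', 'u', 'http://' + url)  # assumes Marc.set(tag, code, value)
    return bibset.to_xml().encode('utf-8')        # caller decodes, as above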
Example #5
def unbis():
    '''
    Outputs UNBIS Thesaurus subject heading records in MARCXML format:
    /unbis?skip=n&limit=m
    The skip=n URL parameter skips n records (default 0).
    The limit=m URL parameter caps the number of records returned (default 50).
    Uses DLX's authset.to_xml serialization to output fields 035 and 150 in MARCXML.
    '''

    try:
        skp = int(request.args.get('skip'))
    except (TypeError, ValueError):
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except (TypeError, ValueError):
        limt = 50
    print(f"skip is {skp} and limit is {limt}")
    query = QueryDocument(
        Condition(tag='035', subfields={'a': re.compile('^T')}))
    print(query.to_json())
    authset = AuthSet.from_query(query,
                                 projection={
                                     '035': 1,
                                     '150': 1
                                 },
                                 skip=skp,
                                 limit=limt)
    unbis = authset.to_xml()
    return Response(unbis, mimetype='text/xml')
Example #6
def jsonf(date):
    '''
    Outputs records in the native central DB schema JSON format for the date
    provided as a dynamic route in YYYYMMDD or YYYY-MM-DD format,
    e.g. /YYYY-MM-DD/json or /YYYYMMDD/json?skip=n&limit=m.
    The skip=n URL parameter skips n records (default 0).
    The limit=m URL parameter caps the number of records returned (default 50).
    If the date is in the wrong format, the function returns today's records.
    Uses DLX's bib.to_json serialization to output JSON.
    '''
    try:
        skp = int(request.args.get('skip'))
    except (TypeError, ValueError):
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except (TypeError, ValueError):
        limt = 50
    print(f"skip is {skp} and limit is {limt}")
    str_date = date.replace('-', '')
    print(f"the original str_date is {str_date}")
    if len(str_date) != 8:
        # fall back to today's records; strftime zero-pads month and day,
        # which string concatenation of date.month/date.day did not
        str_date = datetime.datetime.now().strftime('%Y%m%d')
    print(f"the str_date is {str_date}")
    query = QueryDocument(
        Condition(tag='998', subfields={'z': re.compile('^' + str_date)}),
        Condition(tag='029', subfields={'a': 'JN'}))

    bibset = BibSet.from_query(query,
                               projection={
                                   '029': 1,
                                   '091': 1,
                                   '191': 1,
                                   '245': 1,
                                   '269': 1,
                                   '650': 1,
                                   '991': 1,
                                   '998': 1
                               },
                               skip=skp,
                               limit=limt)

    jsonl = []
    for bib in bibset.records:
        jsonl.append(bib.to_json())
    return jsonify(jsonl)
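
bib.to_json() appears to return a JSON-encoded string (query.to_json() is
printed as one elsewhere in these examples), so the route above yields a JSON
array of strings. If an array of nested objects is wanted instead, a sketch
under that assumption:

import json

jsonl = [json.loads(bib.to_json()) for bib in bibset.records]
return jsonify(jsonl)  # now an array of objects rather than encoded strings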
Example #7
def symbols(date):
    '''
    Outputs records in plain text format for the date provided as a dynamic
    route in YYYYMMDD or YYYY-MM-DD format,
    e.g. /YYYYMMDD/symbols or /YYYY-MM-DD/symbols?skip=n&limit=m.
    The skip=n URL parameter skips n records (default 0).
    The limit=m URL parameter caps the number of records returned (default 50).
    If the date is in the wrong format, the function returns today's records.
    Uses DLX's bib.to_str serialization to output plain text.
    '''
    try:
        skp = int(request.args.get('skip'))
    except (TypeError, ValueError):
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except (TypeError, ValueError):
        limt = 50
    print(f"skip is {skp} and limit is {limt}")
    str_date = date.replace('-', '')
    print(f"the original str_date is {str_date}")
    if len(str_date) != 8:
        # fall back to today's records; strftime zero-pads month and day
        str_date = datetime.datetime.now().strftime('%Y%m%d')
    print(f"the str_date is {str_date}")

    query = QueryDocument(
        Condition(tag='998', subfields={'z': re.compile('^' + str_date)}),
        Condition(tag='029', subfields={'a': 'JN'}))

    bibset = BibSet.from_query(query,
                               projection={
                                   '029': 1,
                                   '191': 1
                               },
                               skip=skp,
                               limit=limt)

    str_out = ''
    for bib in bibset.records:
        str_out += bib.to_str()
    return Response(str_out, mimetype='text/plain')
Example #8
def test_querydocument(db):
    from dlx.marc import Bib, Auth, QueryDocument, Condition, Or
    from bson import SON
    from json import loads
    import re
    
    query = QueryDocument(Condition(tag='245', subfields={'a': 'This'}))
    assert isinstance(query.compile(), SON)
    
    qjson = query.to_json()
    qdict = loads(qjson)
    assert qdict['245']['$elemMatch']['subfields']['$elemMatch']['code'] == 'a'
    assert qdict['245']['$elemMatch']['subfields']['$elemMatch']['value'] == 'This'
    
    query = QueryDocument(
        Condition(tag='245', subfields={'a': re.compile(r'(This|Another)'), 'b': 'is the', 'c': 'title'}),
        Condition(tag='650', modifier='exists'),
        Or(
            Condition(tag='710', modifier='exists'),
            Condition(tag='520', modifier='not_exists')
        )
    )
    assert len(list(Bib.find(query.compile()))) == 2
    
    query = QueryDocument(
        Condition(tag='110', subfields={'a': 'Another header'}),
    )
    assert len(list(Auth.find(query.compile()))) == 1
    assert Auth.find_one(query.compile()).id == 2
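
For reference, the assertions above imply that QueryDocument.compile()
produces a MongoDB filter of roughly this shape for
Condition(tag='245', subfields={'a': 'This'}) (inferred from the test, not
taken from the dlx source):

expected_shape = {
    '245': {
        '$elemMatch': {
            'subfields': {
                '$elemMatch': {'code': 'a', 'value': 'This'}
            }
        }
    }
}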
Example #9
def fetch_auth_data(self, proj_auth_dict):
    match_criteria = fetch_agenda(self.body, self.session)
    query_auth = QueryDocument(
        Condition(
            tag='191',
            #subfields={'a': re.compile('^'+self.body+'/'+self.session[0:4])}
            subfields={'a': match_criteria}))
    #print(query_auth.to_json())
    authset = AuthSet.from_query(query_auth, projection=proj_auth_dict, skip=0, limit=0)
    lauths = list(authset.records)
    print(f"authset length is : {len(lauths)}")
    return lauths  #, l_temp
Example #10
def unbis_tcode(tcode):
    '''
    Looks up UNBIS Thesaurus T codes and returns matching subject heading
    records as JSON, with labels in the six official languages.
    The skip=n URL parameter skips n records (default 0).
    The limit=m URL parameter caps the number of records returned (default 50).
    '''
    try:
        skp = int(request.args.get('skip'))
    except (TypeError, ValueError):
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except (TypeError, ValueError):
        limt = 50
    #print(f"skip is {skp} and limit is {limt}")
    query = QueryDocument(
        Condition(tag='035', subfields={'a': re.compile(str(tcode).upper())}))
    print(query.to_json())
    dict1 = {}
    authset = AuthSet.from_query(query,
                                 projection={
                                     '035': 1,
                                     '150': 1,
                                     '993': 1,
                                     '994': 1,
                                     '995': 1,
                                     '996': 1,
                                     '997': 1
                                 },
                                 skip=skp,
                                 limit=limt)
    for auth in authset:
        val_035a = auth.get_values('035', 'a')
        #print(f"035 values are: {val_035a}")
        # avoid shadowing the str builtin; keep only values beginning with 'T'
        val_035a = ''.join(v for v in val_035a if v.startswith('T'))
        dict1[val_035a] = {
            'EN': auth.get_value('150', 'a'),
            'FR': auth.get_value('993', 'a'),
            'ES': auth.get_value('994', 'a'),
            'AR': auth.get_value('995', 'a'),
            'ZH': auth.get_value('996', 'a'),
            'RU': auth.get_value('997', 'a')
        }
        #dict1['FR']=auth.get_value('993','a')
    #unbis=authset.to_xml()
    #return Response(unbis, mimetype='text/xml')
    return jsonify(dict1)
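
The response is keyed by T code, with one label per language; the shape that
follows from the dict built above is roughly (the T code and label values here
are invented placeholders):

# {
#     "T0100": {"EN": "...", "FR": "...", "ES": "...",
#               "AR": "...", "ZH": "...", "RU": "..."}
# }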
Example #11
def fetch_bib_data(self, proj_dict):
    query = QueryDocument(
        Or(
            Condition(
                tag='191',
                subfields={'b': self.body + '/', 'c': self.session}),
            Condition(
                tag='791',
                subfields={'b': self.body + '/', 'c': self.session}),
            Condition(
                tag='930',
                subfields={'a': 'ITP' + self.body + self.session})))
    #print(query.to_json())
    bibset = BibSet.from_query(query, projection=proj_dict, skip=0, limit=0)
    #l_temp = bibset.count
    #self.snapshot_len = l_temp
    lbibs = list(bibset.records)
    print(f"bibset length is : {len(lbibs)}")
    return lbibs  #, l_temp
Example #12
def unbis_label(label):
    '''
    Looks up UNBIS Thesaurus labels and returns matching T codes.
    The skip=n URL parameter skips n records (default 0).
    The limit=m URL parameter caps the number of records returned (default 50).
    Uses a DLX authset to output fields 035 and 150.
    '''
    try:
        skp = int(request.args.get('skip'))
    except (TypeError, ValueError):
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except (TypeError, ValueError):
        limt = 50
    print(f"skip is {skp} and limit is {limt}")
    query = QueryDocument(
        Condition(tag='150', subfields={'a': re.compile(str(label).upper())}))
    print(query.to_json())
    dict1 = {}
    authset = AuthSet.from_query(query,
                                 projection={
                                     '035': 1,
                                     '150': 1
                                 },
                                 skip=skp,
                                 limit=limt)
    '''
    for auth in authset:
        dict1[auth.get_value('150','a')]=auth.get_value('035','a')
    #unbis=authset.to_xml()
    #return Response(unbis, mimetype='text/xml')
    return jsonify(dict1)
    '''

    for auth in authset:
        val_035a = auth.get_values('035', 'a')
        #print(f"035 values are: {val_035a}")
        # avoid shadowing the str builtin; keep only values beginning with 'T'
        val_035a = ''.join(v for v in val_035a if v.startswith('T'))
        dict1[auth.get_value('150', 'a')] = val_035a
        #dict1['FR']=auth.get_value('993','a')
    #unbis=authset.to_xml()
    #return Response(unbis, mimetype='text/xml')
    return jsonify(dict1)
Example #13
def show_xml(path):
    query = QueryDocument(
        Condition(
            tag='191',
            #subfields={'a': re.compile('^'+path+'$')}
            subfields={'a': path}))
    #print(f" the imp query is  -- {query.to_json()}")
    bibset = BibSet.from_query(query,
                               projection={
                                   '029': 1,
                                   '091': 1,
                                   '191': 1,
                                   '245': 1,
                                   '269': 1,
                                   '650': 1,
                                   '856': 1,
                                   '991': 1
                               })
    xml = bibset.to_xml()
    # remove double spaces from the XML; they cause problems with the job
    # number on ODS export
    xml = xml.replace("  ", " ")
    return Response(xml, mimetype='text/xml')
Example #14
def show_symbols(path):
    path = re.escape(path)
    data = ""
    return_data = ""
    query = QueryDocument(
        Condition(
            tag='191',
            subfields={'a': Regex('^' + path)},
        ), )
    print(f" the query is  -- {query.to_json()}")
    bibset = BibSet.from_query(query,
                               projection={'191': True},
                               skip=0,
                               limit=0)
    a_res_en = []
    for bib in bibset.records:
        a_res_en.append(bib.get_value('191', 'a'))
    return_data = sorted([quote(doc) for doc in a_res_en],
                         key=lambda x: int(''.join(c for c in x
                                                   if c.isdigit())))
    #return_data=a_res_en
    return (jsonify(return_data))
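
The sort key above raises ValueError for any symbol that contains no digits
(int('') fails). A safer key, as a sketch:

def digit_key(sym):
    # digits-only sort key; symbols without digits sort first instead of crashing
    digits = ''.join(c for c in sym if c.isdigit())
    return int(digits) if digits else 0

return_data = sorted((quote(doc) for doc in a_res_en), key=digit_key)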
Example #15
def show_txt(path):
    query = QueryDocument(
        Condition(
            tag='191',
            #subfields={'a': re.compile('^'+path+'$')}
            subfields={'a': path}))
    #print(f" the imp query is  -- {query.to_json()}")
    #export_fields={'089':1,'091':1,'191': 1,'239':1,'245':1,'249':1,'260':1,'269':1,'300':1,'500':1,'515':1,'520':1,'596':1,'598':1,'610':1,'611':1,'630:1,''650':1,'651':1,'710':1,'981':1,'989':1,'991':1,'992':1,'993':1,'996':1}
    bibset = BibSet.from_query(query)
    out_list = [('089', 'b'), ('091', 'a'), ('191', 'a'), ('191', 'b'),
                ('191', 'c'), ('191', '9'), ('239', 'a'), ('245', 'a'),
                ('245', 'b'), ('249', 'a'), ('245', 'a'), ('260', 'a'),
                ('260', 'b'), ('260', 'a'), ('260', 'c'), ('269', 'a'),
                ('300', 'a'), ('500', 'a'), ('515', 'a'), ('520', 'a'),
                ('596', 'a'), ('598', 'a'), ('610', 'a'), ('611', 'a'),
                ('630', 'a'), ('650', 'a'), ('651', 'a'), ('710', 'a'),
                ('981', 'a'), ('989', 'a'), ('989', 'b'), ('989', 'c'),
                ('991', 'a'), ('991', 'b'), ('991', 'c'), ('991', 'd'),
                ('992', 'a'), ('993', 'a'), ('996', 'a')]
    #print(f"duration for query was {datetime.now()-start_time_query}")
    jsonl = []

    for bib in bibset.records:
        out_dict = {}
        #start_time_bib=datetime.now()
        for entry in out_list:
            #start_time_field=datetime.now()
            out_dict[entry[0] + '__' + entry[1]] = bib.get_values(
                entry[0], entry[1])
            #print(f"for the field {entry[0]+'__'+entry[1]}")
            #print(f"duration for getting values was {datetime.now()-start_time_field}")
        jsonl.append(out_dict)
        print(f"for the bib {bib.get_values('191','a')}")
        #print(f"duration for getting bib values was {datetime.now()-start_time_bib}")
    #print(f"total duration was {datetime.now()-start_time_all}")
    return jsonify(jsonl)
Example #16
def votes(topic):
    '''
    Looks up General Assembly resolutions matching the given topic and returns
    the voting-data records matching the Country and Vote URL parameters.
    The skip=n URL parameter skips n records (default 0).
    The limit=m URL parameter caps the number of records returned (default 50).
    Other parameters and defaults: year_from=1980, year_to=2020,
    Country=CANADA, Vote=A.
    '''
    try:
        skp = int(request.args.get('skip'))
    except (TypeError, ValueError):
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except (TypeError, ValueError):
        limt = 50
    # request.args.get never raises on a missing key, so the original
    # try/excepts never applied their defaults; pass the defaults directly
    yr_from = request.args.get('year_from', '1980')
    yr_to = request.args.get('year_to', '2020')
    cntry = request.args.get('Country', 'CANADA')
    vt = request.args.get('Vote', 'A')

    print(f"skip is {skp} and limit is {limt}")
    print(f"year_from is {yr_from} and year_to is {yr_to}")
    print(f"Country is {cntry}")
    print(f"Vote is {vt}")

    query = QueryDocument(
        Condition(tag='191', subfields={'d': re.compile(str(topic))}),
        Condition(tag='191', subfields={'a': re.compile('^A')}))
    print(query.to_json())
    dict_auth_ids = {}
    authset = AuthSet.from_query(query,
                                 projection={
                                     '001': 1,
                                     '191': 1
                                 },
                                 skip=skp,
                                 limit=limt)
    for auth in authset:
        dict_auth_ids[auth.get_value('191', 'a')] = auth.get_value('001')
    #unbis=authset.to_xml()
    #return Response(unbis, mimetype='text/xml')
    #return jsonify(dict_auth_ids)
    dict_bibs = {}
    str_bibs = ''
    votecountry = ''
    for key, value in dict_auth_ids.items():
        #sample_id=int(dict_auth_ids['A/74/251'])
        print(f"the id of {key} is {value}")
        query_bib = QueryDocument(
            Condition(tag='991', subfields={'d': int(value)}),
            Condition(tag='989', subfields={'a': re.compile('Voting Data')}))

        print(query_bib.to_json())
        bibset = BibSet.from_query(query_bib,
                                   projection={
                                       '001': 1,
                                       '791': 1,
                                       '967': 1
                                   },
                                   skip=skp,
                                   limit=limt)
        for bib in bibset:
            for field in bib.get_fields('967'):
                votecountry = field.get_value("d") + field.get_value("e")
                #print(f'Country+Vote: {votecountry}')
                # keep entries matching both input query parameters (AND logic)
                if votecountry == str(vt) + str(cntry):
                    dict_bibs[bib.get_value('791', 'a')] = bib.get_value('001')
                    str_bibs += ' OR 791:[' + bib.get_value('791', 'a') + ']'
    print(str_bibs)
    return jsonify(dict_bibs)
Example #17
def jsons(date):
    '''
    Outputs Security Council bib records in plain JSON format for the date
    provided as a dynamic route in YYYYMMDD or YYYY-MM-DD format,
    e.g. /YYYY-MM-DD/xml?skip=n&limit=m.
    The skip=n URL parameter skips n records (default 0).
    The limit=m URL parameter caps the number of records returned (default 50).
    If the date is in the wrong format, the function returns today's records.
    Used to publish S/ records for iSCAD+ as plain JSON.
    22 July: added fields 049:a and 260:a.
    '''
    try:
        skp = int(request.args.get('skip'))
    except (TypeError, ValueError):
        skp = 0
    try:
        limt = int(request.args.get('limit'))
    except (TypeError, ValueError):
        limt = 50
    print(f"skip is {skp} and limit is {limt}")
    #start_time_all=datetime.now()
    str_date = date.replace('-', '')
    print(f"the original str_date is {str_date}")
    if len(str_date) != 8:
        # fall back to today's records; strftime zero-pads month and day
        str_date = datetime.datetime.now().strftime('%Y%m%d')
    print(f"the str_date is {str_date}")
    #start_time_query=datetime.now()
    query = QueryDocument(
        Condition(tag='998', subfields={'z': re.compile('^' + str_date)}),
        Condition(tag='191', subfields={'b': re.compile(r'^S/')}))
    export_fields = {
        '089': 1,
        '091': 1,
        '191': 1,
        '239': 1,
        '245': 1,
        '249': 1,
        '260': 1,
        '269': 1,
        '300': 1,
        '500': 1,
        '515': 1,
        '520': 1,
        '596': 1,
        '598': 1,
        '610': 1,
        '611': 1,
        '630': 1,
        '650': 1,
        '651': 1,
        '710': 1,
        '981': 1,
        '989': 1,
        '991': 1,
        '992': 1,
        '993': 1,
        '996': 1
    }
    bibset = BibSet.from_query(query,
                               projection=export_fields,
                               skip=skp,
                               limit=limt)
    out_list = [('089', 'b'), ('091', 'a'), ('191', 'a'), ('191', 'b'),
                ('191', 'c'), ('191', '9'), ('239', 'a'), ('245', 'a'),
                ('245', 'b'), ('249', 'a'), ('245', 'a'), ('260', 'a'),
                ('260', 'b'), ('260', 'a'), ('260', 'c'), ('269', 'a'),
                ('300', 'a'), ('500', 'a'), ('515', 'a'), ('520', 'a'),
                ('596', 'a'), ('598', 'a'), ('610', 'a'), ('611', 'a'),
                ('630', 'a'), ('650', 'a'), ('651', 'a'), ('710', 'a'),
                ('981', 'a'), ('989', 'a'), ('989', 'b'), ('989', 'c'),
                ('991', 'a'), ('991', 'b'), ('991', 'c'), ('991', 'd'),
                ('992', 'a'), ('993', 'a'), ('996', 'a')]
    #print(f"duration for query was {datetime.now()-start_time_query}")
    jsonl = []

    for bib in bibset.records:
        out_dict = {}
        #start_time_bib=datetime.now()
        for entry in out_list:
            #start_time_field=datetime.now()
            out_dict[entry[0] + '__' + entry[1]] = bib.get_values(
                entry[0], entry[1])
            #print(f"for the field {entry[0]+'__'+entry[1]}")
            #print(f"duration for getting values was {datetime.now()-start_time_field}")
        jsonl.append(out_dict)
        #print(f"for the bib {bib.get_values('191','a')}")
        #print(f"duration for getting bib values was {datetime.now()-start_time_bib}")
    #print(f"total duration was {datetime.now()-start_time_all}")
    return jsonify(jsonl)
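
Every tag in out_list must also appear in export_fields; deriving the
projection from the list keeps the two from drifting apart (the '630' entry
fixed above was exactly such a drift). A minimal sketch:

# dict-comprehension keys de-duplicate repeated tags automatically
export_fields = {tag: 1 for tag, code in out_list}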
Example #18
def show_txt(path):
    '''displays the text of the document '''
    data = ""
    return_data = ""
    doc_list = []
    #path=quote(path)
    path = re.escape(path)
    '''
    i2 = urllib.parse.quote(i.encode("utf-8"))  # need to deal with special characters in each url
    uu2 = urllib.parse.urljoin(uu, i2)          # create the url
    '''
    print(f" this is compiled path -- {'^' + str(path)+'$'}")
    doc_list = list(
        txts_coll.find({"doc_sym": {
            "$regex": "^" + str(path) + "$"
        }}))
    if len(doc_list) == 0 and path != 'favicon.ico':
        print(f"no exact DS {str(path)} - generating one")
        bib_value = ''
        #doc_list=list(txts_coll.find({"doc_sym":{"$regex":path}}))
        ''' extract text from DB'''
        #build list of tuples (striped_doc_sum, url to the pdf in s3)
        query = QueryDocument(
            Condition(tag='191', subfields={'a': Regex('^' + path + '$')}))
        #)
        print(f" the imp query is  -- {query.to_json()}")
        bibset = BibSet.from_query(query, skip=0, limit=3)
        a_res_en = []
        if bibset.count == 1:
            for bib in bibset.records:
                bib_value = bib.get_value('191', 'a')
                a_res_en.append(
                    (bib.get_value('191',
                                   'a'), 'http://' + ''.join(bib.files('EN'))))
                print(a_res_en)
                for url in a_res_en:
                    #txt_name = url.split('/')[-1]
                    #url is a tuple ; url[0] is a DS; url[1] is a s3 link to the pdf
                    txt_name = url[0]  # e.g. ARES721
                    #txt_name = txt_name.split('.')[0] +'.txt'
                    #txt_name = txt_name +'.txt'
                    #txt_loc='\\txts\\'+txt_name
                    if len(url[1]) > 10:
                        print(f" - - the {url[0]} is {url[1]} - -")
                        pdf = PDFExtract(url[1])
                        parsed = parser.from_buffer(
                            pdf.get_txt_from_url(url[1]))
                        print(f"0----PDFExtract----0")
                        txt = Txt(bib.get_value('191', 'a'))
                        print(txt.set_txt(parsed["content"]))
                        txt.title = bib.get_value('245', 'a')
                        #txt.title=bib.get_value('239','a')
                        ''' load text into txts'''
                        if txt.txt is not None:
                            query = {"doc_sym": txt.symbol}
                            txts_coll.replace_one(query,
                                                  txt.to_bson(),
                                                  upsert=True)

    doc_list = []
    doc_list = list(
        txts_coll.find({"doc_sym": {
            "$regex": "^" + str(path) + "$"
        }}))
    print(f" this is compiled path -- {'^' + str(path)+'$'}")
    if len(doc_list) == 1:
        print(f"-- it's a hit- 1")
        if doc_list[0]['doc_sym'][0] != 'S':
            return_data = doc_list[0]['raw_txt']
        else:
            # for SC docs (temporary measure): drop the non-serializable ObjectId
            doc_list[0].pop('_id')
            return_data = doc_list[0]
    elif len(doc_list) > 1:
        print(f"-- it's a hit- many")
        return_data = sorted([doc['doc_sym'] for doc in doc_list],
                             key=lambda x: int(''.join(c for c in x
                                                       if c.isdigit())))
        #return_data=sorted(["<a href="+doc['doc_sym']+">" for doc in doc_list])
        #return_data=sorted([url_for('/'+doc_list[0]['raw_txt']) for doc in doc_list])

    if return_data == "":
        return jsonify('text with document symbol:%s was not found' % path)
    #return(render_template('ds.html', data=return_data))
    #print(return_data)
    return jsonify(return_data)
Example #19
from bson import Regex
from dlx import DB
from dlx.marc import BibSet, QueryDocument, Condition
from config import Config
DB.connect(Config.connect_string)

query = QueryDocument(Condition(tag='191', modifier='exists'),
                      Condition(tag='269', subfields={'a': Regex('^1975')}))

print(query.to_json())

bibset = BibSet.from_query(query, projection={'191': True}, skip=0, limit=0)
print('There are {} results'.format(bibset.count))

bibset.cache()

for bib in bibset.records:
    print('id: {}, symbol: {}'.format(bib.id, bib.get_value('191', 'a')))

print(bibset.to_xml())
Example #20
def run():
    args = get_args()

    DLX.connect(args.dlx_connect)
    S3.connect(bucket=args.s3_bucket)

    symbols = [args.symbol] if args.symbol else [
        re.split('\t', x)[0].strip() for x in open(args.list).readlines()
    ]
    langs = [args.language] if args.language else LANG.keys()

    for sym in symbols:
        bib = Bib.from_query(Query(
            Or(Condition('191', {'a': sym}), Condition('191', {'z': sym}))),
                             collation=Collation(locale='en', strength=2))

        if not bib and not args.skip_check:
            logging.warning(f'Bib for document {sym} not found. Skipping.')
            continue
        elif bib and not args.skip_check:
            # capture symbols from the bib record (exclude those beginning with brackets)
            ids = list(
                filter(
                    lambda x: x[0] != '[',
                    (bib.get_values('191', 'a') + bib.get_values('191', 'z'))))
        else:
            logging.warning(
                f'Bib for document {sym} not found with --skip_check enabled. Using {sym} as identifier'
            )
            ids = [sym]  # use the requested symbol itself, as the warning states

        for lang in langs:
            logging.info(f'Getting {sym} {lang} ...')

            try:
                fh = ODS.download(
                    sym if not args.ods_symbol else args.ods_symbol, lang)
            except FileNotFound:
                logging.warning(f'{sym} {lang} not found in ODS')
                continue
            except Exception as e:
                logging.warning(e)
                continue

            isolang = LANG[lang]

            try:
                result = File.import_from_handle(
                    fh,
                    filename=File.encode_fn(sym, isolang, 'pdf'),
                    identifiers=[Identifier('symbol', s) for s in ids],
                    languages=[isolang],
                    mimetype='application/pdf',
                    source='ods-importx',
                    overwrite=args.overwrite)
                logging.info(f'OK - {result.id}')
            except FileExistsLanguageConflict as e:
                logging.warning(f'{e.message} X {isolang}')
            except FileExistsIdentifierConflict as e:
                logging.warning(f'{e.message} X {ids}')
            except FileExists:
                logging.info('Already in the system')
            except:
                raise