def count_and_store():
    """Tally languages over every MARC file and save the results as JSON.

    Walks each ``(filename, records)`` pair produced by
    ``generate_marcfiles()`` and, once per record, feeds the result of
    ``get_language`` to ``addlang``. Afterwards the global ``langcount`` is
    dumped to ``langcounts.json`` and the global ``langs`` (converted to a
    list) to ``langs.json``, both in the current working directory.
    """
    for marc_name, marc_records in generate_marcfiles():
        print("Processing {0}".format(marc_name))
        for _record in marc_records:
            # NOTE(review): the lookup is made on the per-file object, not on
            # the individual record being iterated -- confirm this is intended.
            detected = get_language(marc_records)
            addlang(detected)

    with open("langcounts.json", "w") as counts_out:
        json.dump(langcount, counts_out)

    with open("langs.json", "w") as langs_out:
        json.dump(list(langs), langs_out)
def sfx_rosetta(reverse_order=False):
    """Write a TSV linking system numbers to SFX-flagged 852 $b/$j values.

    For every record in every file yielded by
    ``generate_marcfiles(reverse_order)``, the record's 852 fields are
    scanned for one whose ``$c`` subfields include ``"SFX"``. When such a
    field also carries a ``$j``, a tab-separated row of system number, ``$b``
    and ``$j`` is written to ``sfx_mapping.tsv`` (created or truncated in the
    current directory) and a "HIT!" line is printed. If several 852 fields
    qualify, the last one wins.

    An SFX-flagged 852 without a ``$j`` is skipped, and a missing ``$b`` is
    written as an empty column, instead of raising ``IndexError``.

    Args:
        reverse_order: Passed straight through to ``generate_marcfiles``.
    """
    with open("sfx_mapping.tsv", "w") as sfxfile:
        sfxfile.write("System Number\t$b\t$j\n")
        for docfile, doc in generate_marcfiles(reverse_order):
            print("Processing {0}".format(docfile))
            for item in doc:
                # Fetch the field list once; it is needed for both the system
                # number and the 852 scan.
                fields = item.get_fields()
                # The data of the record's first field is used as the system
                # number.
                sysnumber = fields[0].data
                domid = ""
                b = ""
                for loc in fields:
                    if loc.tag != "852":
                        continue
                    c = loc.get_subfields("c")
                    if not (c and "SFX" in c):
                        continue
                    j_values = loc.get_subfields("j")
                    if not j_values:
                        # SFX holding with no $j: nothing to link to.
                        continue
                    domid = j_values[0]
                    b_values = loc.get_subfields("b")
                    b = b_values[0] if b_values else ""
                if domid:
                    print("HIT! '{0}' might link to '{1}'. PDF at http://access.dl.bl.uk/{1}".format(sysnumber, domid))
                    sfxfile.write("{0}\t{1}\t{2}\n".format(sysnumber, b, domid))
def field_coverage():
    """Count, per field key, how many records contain that key.

    Walks each ``(filename, records)`` pair from ``generate_marcfiles()``.
    For every record, the keys of the dicts listed under
    ``record.as_dict()["fields"]`` (presumably MARC tags -- confirm against
    the record type) are gathered into a set, so a key repeated inside one
    record is counted only once for that record. After each file the running
    totals, accumulated over all files so far, are printed sorted by key.

    Returns:
        A ``(coverage, count)`` tuple. ``coverage`` is a ``defaultdict``
        mapping each key to the number of records that contain it; ``count``
        is the total number of records processed.
    """
    from collections import defaultdict

    count = 0
    coverage = defaultdict(int)
    for docfile, doc in generate_marcfiles():
        print("Processing {0}".format(docfile))
        for item in doc:
            # De-duplicate keys within this record before counting.
            keys_in_record = set()
            for field in item.as_dict()["fields"]:
                keys_in_record.update(field.keys())
            for key in keys_in_record:
                coverage[key] += 1
            count += 1
        # Per-file report of the cumulative totals.
        print("-- {0} --".format(docfile))
        for key in sorted(coverage):
            print(". {0} {1:07d}".format(key, coverage[key]))
        print("\n\n")
    return coverage, count
def _first_subfield(field, code, default=""):
    """Return the first ``code`` subfield value of ``field``, or ``default`` if it has none."""
    values = field.get_subfields(code)
    return values[0] if values else default


def sfx_rosetta(reverse_order=False):
    """Export system-number to SFX 852 $b/$j links as ``sfx_mapping.tsv``.

    Iterates the ``(filename, records)`` pairs from
    ``generate_marcfiles(reverse_order)``. For each record, every 852 field
    whose ``$c`` subfields include ``"SFX"`` and which has a ``$j`` supplies
    the link id (``$j``) and the ``$b`` value; the last such field wins. If a
    link id was found, a "HIT!" line is printed and a tab-separated row of
    system number, ``$b`` and ``$j`` is appended to the output file, which is
    created or truncated in the current directory.

    SFX-flagged 852 fields lacking ``$j`` are ignored, and a missing ``$b``
    becomes an empty column, rather than raising ``IndexError``.

    Args:
        reverse_order: Forwarded unchanged to ``generate_marcfiles``.
    """
    with open("sfx_mapping.tsv", "w") as sfxfile:
        sfxfile.write("System Number\t$b\t$j\n")
        for docfile, doc in generate_marcfiles(reverse_order):
            print("Processing {0}".format(docfile))
            for item in doc:
                all_fields = item.get_fields()  # reused for both lookups below
                # The first field's data serves as the system number.
                sysnumber = all_fields[0].data
                domid = ""
                b = ""
                for holding in (f for f in all_fields if f.tag == "852"):
                    flags = holding.get_subfields("c")
                    if flags and "SFX" in flags:
                        link_id = _first_subfield(holding, "j", default=None)
                        if link_id is None:
                            # No $j on this SFX holding: nothing to record.
                            continue
                        domid = link_id
                        b = _first_subfield(holding, "b")
                if domid:
                    print(
                        "HIT! '{0}' might link to '{1}'. PDF at http://access.dl.bl.uk/{1}"
                        .format(sysnumber, domid))
                    sfxfile.write("{0}\t{1}\t{2}\n".format(
                        sysnumber, b, domid))
from utils.marcutils import generate_marcfiles, get_solr_doc, collate
import pysolr
from settings import SOLR

if __name__ == "__main__":
    # Convert every MARC record to a Solr document and upload them in
    # batches, reporting progress per input file.
    print("Attempting Solr connection on '{0}'".format(SOLR))
    # long timeout as the final optimize step can be long
    solr = pysolr.Solr(SOLR, timeout=100)

    current_file = ""   # name of the file whose records are being read
    uploaded = 0        # records handled across all files
    position = 1        # 1-based record position within the current file
    pending = []        # documents collected since the last solr.add()

    for file_name, records in generate_marcfiles():
        for record in records:
            if file_name != current_file:
                # First record of a new file: announce it and restart the
                # per-file position. Pending documents are carried over.
                current_file = file_name
                position = 1
                if not uploaded:
                    print("Uploading {0} to Solr...".format(file_name))
                else:
                    print("Uploading {0} to Solr... ({1} completed)".format(file_name, str(uploaded)))

            pending.append(get_solr_doc(collate(record)))
            uploaded += 1

            # Flush whenever the per-file position reaches a multiple of 100.
            if position % 100 == 0:
                print("{0} - processed".format(str(position)))
                solr.add(pending)
                pending = []
            position += 1

    # Send whatever is left over from the last partial batch.
    solr.add(pending)
    print("Job complete. {0} records uploaded to Solr".format(str(uploaded)))
from utils.marcutils import generate_marcfiles, get_solr_doc, collate import pysolr from settings import SOLR if __name__ == "__main__": print("Attempting Solr connection on '{0}'".format(SOLR)) solr = pysolr.Solr( SOLR, timeout=100) # long timeout as the final optimize step can be long cname = "" count = 0 cc = 1 docs = [] for fname, marcdoc in generate_marcfiles(): for marcfile in marcdoc: if cname != fname: cname = fname cc = 1 if count: print("Uploading {0} to Solr... ({1} completed)".format( fname, str(count))) else: print("Uploading {0} to Solr...".format(fname)) doc = get_solr_doc(collate(marcfile)) docs.append(doc) count += 1 if not (cc % 100): print("{0} - processed".format(str(cc))) solr.add(docs) docs = []