def count_and_store():
  """Tally the language of every MARC record and dump the tallies as JSON.

  Walks every (filename, records) pair yielded by ``generate_marcfiles()``,
  feeds the result of ``get_language`` for each record to ``addlang``, then
  writes the module-level ``langcount`` to ``langcounts.json`` and ``langs``
  (converted to a list) to ``langs.json`` in the current working directory.

  NOTE(review): ``addlang`` presumably updates ``langcount`` and ``langs``;
  neither helper is visible here -- confirm.
  """
  for docfile, doc in generate_marcfiles():
    print("Processing {0}".format(docfile))
    for item in doc:
      # Inspect the individual record. The earlier code passed the whole
      # `doc` container on every iteration and never used `item`.
      addlang(get_language(item))

  with open("langcounts.json", "w") as langcountfile:
    json.dump(langcount, langcountfile)
  with open("langs.json", "w") as langfile:
    # list(...) because `langs` is presumably a set, which json cannot encode.
    json.dump(list(langs), langfile)
def count_and_store():
    """Tally the language of every MARC record and dump the tallies as JSON.

    Walks every (filename, records) pair yielded by ``generate_marcfiles()``,
    feeds the result of ``get_language`` for each record to ``addlang``, then
    writes the module-level ``langcount`` to ``langcounts.json`` and ``langs``
    (converted to a list) to ``langs.json`` in the current working directory.

    NOTE(review): ``addlang`` presumably updates ``langcount`` and ``langs``;
    neither helper is visible here -- confirm.
    """
    for docfile, doc in generate_marcfiles():
        print("Processing {0}".format(docfile))
        for item in doc:
            # Inspect the individual record. The earlier code passed the whole
            # `doc` container on every iteration and never used `item`.
            addlang(get_language(item))

    with open("langcounts.json", "w") as langcountfile:
        json.dump(langcount, langcountfile)
    with open("langs.json", "w") as langfile:
        # list(...) because `langs` is presumably a set, which json cannot
        # encode.
        json.dump(list(langs), langfile)
def sfx_rosetta(reverse_order=False):
  """Write a TSV mapping system numbers to the $b/$j of SFX-flagged 852 fields.

  Creates ``sfx_mapping.tsv`` in the current directory with a header row, then
  walks every record from ``generate_marcfiles(reverse_order)``. For each
  record, every field tagged "852" whose "c" subfields contain the exact value
  "SFX" supplies the first "j" and first "b" subfield; when several such
  fields exist the last one wins. Records that end up with a non-empty "j"
  value are reported on stdout and appended to the TSV.

  NOTE(review): the ``[0]`` lookups assume the record has at least one field
  and that a matching 852 carries both $j and $b; an empty result would raise
  IndexError -- confirm against the data.
  """
  with open("sfx_mapping.tsv", "w") as out:
    out.write("System Number\t$b\t$j\n")
    for marc_path, records in generate_marcfiles(reverse_order):
      print("Processing {0}".format(marc_path))
      for record in records:
        # The first field's data is used as the system number
        # (presumably the 001 control field).
        system_number = record.get_fields()[0].data
        dom_id, sub_b = "", ""
        for field in record.get_fields():
          if field.tag != "852":
            continue
          sub_c = field.get_subfields("c")
          if not (sub_c and "SFX" in sub_c):
            continue
          # Later matches overwrite earlier ones.
          dom_id = field.get_subfields("j")[0]
          sub_b = field.get_subfields("b")[0]
        if not dom_id:
          continue
        print("HIT! '{0}' might link to '{1}'. PDF at http://access.dl.bl.uk/{1}".format(system_number, dom_id))
        out.write("{0}\t{1}\t{2}\n".format(system_number, sub_b, dom_id))
def field_coverage():
  """Count, per field key, how many records contain that key at least once.

  Iterates every record from ``generate_marcfiles()`` and collects the keys of
  each entry in ``item.as_dict()["fields"]`` (presumably MARC tags). A key is
  counted once per record however often it repeats. After each file the
  running (cumulative, not per-file) totals are printed in sorted key order.

  Returns:
    A ``(coverage, count)`` tuple: the defaultdict mapping key -> number of
    records containing it, and the total number of records seen.
  """
  from collections import defaultdict

  tag_hits = defaultdict(lambda: 0)
  records_seen = 0
  for marc_path, records in generate_marcfiles():
    print("Processing {0}".format(marc_path))
    for record in records:
      # A set collapses repeated keys so each counts once for this record.
      present = {key
                 for entry in record.as_dict()["fields"]
                 for key in entry.keys()}
      for key in present:
        tag_hits[key] += 1
      records_seen += 1
    print("--  {0}  --".format(marc_path))
    for key in sorted(tag_hits):
      print(". {0}   {1:07d}".format(key, tag_hits[key]))
    print("\n\n")
  return tag_hits, records_seen
def field_coverage():
    """Count, per field key, how many records contain that key at least once.

    Iterates every record from ``generate_marcfiles()`` and collects the keys
    of each entry in ``item.as_dict()["fields"]`` (presumably MARC tags). A
    key is counted once per record however often it repeats. After each file
    the running (cumulative, not per-file) totals are printed in sorted key
    order.

    Returns:
        A ``(coverage, count)`` tuple: the defaultdict mapping key -> number
        of records containing it, and the total number of records seen.
    """
    from collections import defaultdict

    tag_hits = defaultdict(lambda: 0)
    records_seen = 0
    for marc_path, records in generate_marcfiles():
        print("Processing {0}".format(marc_path))
        for record in records:
            # A set collapses repeated keys so each counts once per record.
            present = {
                key
                for entry in record.as_dict()["fields"]
                for key in entry.keys()
            }
            for key in present:
                tag_hits[key] += 1
            records_seen += 1
        print("--  {0}  --".format(marc_path))
        for key in sorted(tag_hits):
            print(". {0}   {1:07d}".format(key, tag_hits[key]))
        print("\n\n")
    return tag_hits, records_seen
def sfx_rosetta(reverse_order=False):
    """Write a TSV mapping system numbers to the $b/$j of SFX-flagged 852s.

    Creates ``sfx_mapping.tsv`` in the current directory with a header row,
    then walks every record from ``generate_marcfiles(reverse_order)``. For
    each record, every field tagged "852" whose "c" subfields contain the
    exact value "SFX" supplies the first "j" and first "b" subfield; when
    several such fields exist the last one wins. Records that end up with a
    non-empty "j" value are reported on stdout and appended to the TSV.

    NOTE(review): the ``[0]`` lookups assume the record has at least one
    field and that a matching 852 carries both $j and $b; an empty result
    would raise IndexError -- confirm against the data.
    """
    with open("sfx_mapping.tsv", "w") as out:
        out.write("System Number\t$b\t$j\n")
        for marc_path, records in generate_marcfiles(reverse_order):
            print("Processing {0}".format(marc_path))
            for record in records:
                # The first field's data is used as the system number
                # (presumably the 001 control field).
                system_number = record.get_fields()[0].data
                dom_id, sub_b = "", ""
                for field in record.get_fields():
                    if field.tag != "852":
                        continue
                    sub_c = field.get_subfields("c")
                    if not (sub_c and "SFX" in sub_c):
                        continue
                    # Later matches overwrite earlier ones.
                    dom_id = field.get_subfields("j")[0]
                    sub_b = field.get_subfields("b")[0]
                if not dom_id:
                    continue
                print(
                    "HIT! '{0}' might link to '{1}'. PDF at http://access.dl.bl.uk/{1}"
                    .format(system_number, dom_id))
                out.write("{0}\t{1}\t{2}\n".format(
                    system_number, sub_b, dom_id))
from utils.marcutils import generate_marcfiles, get_solr_doc, collate
import pysolr

from settings import SOLR

if __name__ == "__main__":
  # Upload every MARC record to Solr. Converted documents accumulate in a
  # pending batch that is flushed whenever the per-file record counter hits a
  # multiple of 100; whatever remains is sent once all files are done.
  print("Attempting Solr connection on '{0}'".format(SOLR))
  # long timeout as the final optimize step can be long
  solr = pysolr.Solr(SOLR, timeout=100)
  current_name = ""
  count = 0          # records converted so far, across all files
  in_file_index = 1  # 1-based position within the current file
  batch = []
  for fname, marcdoc in generate_marcfiles():
    for marcfile in marcdoc:
      if fname != current_name:
        # First record of a new file: restart the per-file counter and
        # announce the file (with the running total once it is non-zero).
        current_name = fname
        in_file_index = 1
        if not count:
          print("Uploading {0} to Solr...".format(fname))
        else:
          print("Uploading {0} to Solr...    ({1} completed)".format(fname, count))
      batch.append(get_solr_doc(collate(marcfile)))
      count += 1
      if in_file_index % 100 == 0:
        print("{0} - processed".format(in_file_index))
        solr.add(batch)
        batch = []
      in_file_index += 1
  # Send the records left over after the last full batch.
  solr.add(batch)
  print("Job complete. {0} records uploaded to Solr".format(count))
# Example no. 8
0
from utils.marcutils import generate_marcfiles, get_solr_doc, collate
import pysolr

from settings import SOLR

if __name__ == "__main__":
    print("Attempting Solr connection on '{0}'".format(SOLR))
    solr = pysolr.Solr(
        SOLR,
        timeout=100)  # long timeout as the final optimize step can be long
    cname = ""
    count = 0
    cc = 1
    docs = []
    for fname, marcdoc in generate_marcfiles():
        for marcfile in marcdoc:
            if cname != fname:
                cname = fname
                cc = 1
                if count:
                    print("Uploading {0} to Solr...    ({1} completed)".format(
                        fname, str(count)))
                else:
                    print("Uploading {0} to Solr...".format(fname))
            doc = get_solr_doc(collate(marcfile))
            docs.append(doc)
            count += 1
            if not (cc % 100):
                print("{0} - processed".format(str(cc)))
                solr.add(docs)
                docs = []