def create_queries_one_by_one(queries):
    # Iterates over the queries and saves each one individually; a
    # divide-and-conquer retry on the batch would be more efficient.
    for query in queries:
        try:
            query.save()
        except Exception:
            json_log(error='Loading to database failed',
                     exception=True)
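# json_log is used throughout these loaders but is not defined in this
# excerpt. A minimal sketch of what such a structured-logging helper might
# look like, assuming it simply serialises its keyword arguments to JSON
# and dispatches to a logging method; the body and defaults are assumptions.
import json
import logging
import traceback


def json_log(method=logging.info, exception=False, **fields):
    # Hypothetical sketch: emit the keyword arguments as one JSON record so
    # that downstream log aggregation can parse structured fields.
    if exception:
        fields['traceback'] = traceback.format_exc()
    method(json.dumps(fields, default=str))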
def extract_and_load_docs(paths, pool=None):
    """Main driver for loading all XML from one or more paths to a database

    Parameters
    ----------
    paths : string or list of strings
        Each path can either be a directory to be recursed (containing
        XML or Zip or TAR), or a single Zip or TAR file.
    pool : multiprocessing.Pool, optional
        If given, XML extraction is distributed with ``imap_unordered``.
    """
    if isinstance(paths, basestring):
        paths = [paths]

    def already_saved(eid):
        return Document.objects.filter(eid=eid).exists()

    xml_pairs = itertools.chain.from_iterable(
        generate_xml_pairs(path, _with_retry(already_saved))
        for path in paths)

    if pool is None:
        try:
            from itertools import imap
        except ImportError:
            imap = map
    else:
        imap = functools.partial(pool.imap_unordered, chunksize=200)

    counter = -1
    doc_records = []
    for counter, doc_record in enumerate(imap(_process_one, xml_pairs)):
        if counter % MAX_BATCH_SIZE == 0:
            if counter > 0:
                logging.info('Saving after %d records' % counter)
                load_to_db(doc_records)
                doc_records = []
        if doc_record is None:
            continue
        doc_records.append(doc_record)

    if counter < 0:
        json_log(error='Processed 0 records!', method=logging.error)
        return

    # At end of the year, flush out all remaining records
    logging.info('Saving after %d records' % counter)
    load_to_db(doc_records)
    logging.info('Done')
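# A hypothetical invocation of the driver above: distribute XML parsing
# across four worker processes while database writes stay in the parent.
# The paths are made up, and whether the worker functions pickle cleanly in
# the real module layout is an assumption.
def _example_pooled_run():
    from multiprocessing import Pool
    pool = Pool(4)
    try:
        extract_and_load_docs(['/data/scopus/2016.tar',
                               '/data/scopus/2017/'],
                              pool=pool)
    finally:
        pool.close()
        pool.join()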
def generate_xml_pairs(path, eid_filter=None, count_only=False):
    """Finds and returns contents for pairs of XML documents and citedby

    path may be:

    * a directory in which to find XML/TAR/ZIP files
    * a tar file
    * a zip file
    """
    n_skips = 0
    backlog = {}
    for path, f in _generate_files(path):
        if not path.endswith('.xml'):
            if f is not None:
                f.close()
            continue
        # TODO: filter before opening, or after pairing to avoid DB queries
        if (eid_filter is not None and
                eid_filter(int(re.findall(r'(?<=2-s2\.0-)[0-9]+',
                                          path)[-1]))):
            n_skips += 1
            if n_skips % 100000 == 0:
                json_log(info='Skipped %d files so far' % n_skips,
                         method=logging.info)
            f.close()
            continue
        if count_only:
            xml = None
        else:
            xml = f.read()
        f.close()

        key = os.path.dirname(path)
        if key in backlog:
            other_path, other_xml = backlog.pop(key)
            if other_path == path:
                json_log(error='Found duplicate xmls for %r' % path,
                         method=logging.error)
                backlog[key] = (path, xml)
                continue
            if path.endswith('citedby.xml'):
                yield other_path, other_xml, xml
            else:
                assert other_path.endswith('citedby.xml'), other_path
                yield path, xml, other_xml
        else:
            backlog[key] = (path, xml)

    if n_skips:
        json_log(info='Skipped %d files (two per doc) altogether' % n_skips,
                 method=logging.warning)
    if backlog:
        json_log(error='Found unpaired XML files: %s'
                       % [path for path, _ in backlog.values()],
                 exception=True,
                 method=logging.error)
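# _generate_files is referenced above but not included in this excerpt. A
# minimal sketch of what it could look like, assuming it yields
# (path, file object) pairs from a directory tree or a tar archive in the
# way generate_xml_pairs consumes them; zip handling and other details are
# omitted, and none of this is the project's actual implementation.
import os
import tarfile


def _generate_files(path):
    # Hypothetical sketch: walk a directory, or stream members of a tar
    # archive, yielding (path, open file object) pairs.
    if os.path.isdir(path):
        for dirpath, _, filenames in os.walk(path):
            for name in filenames:
                full = os.path.join(dirpath, name)
                yield full, open(full, 'rb')
    elif tarfile.is_tarfile(path):
        with tarfile.open(path) as tar:
            for member in tar:
                if member.isfile():
                    yield member.name, tar.extractfile(member)
    else:
        yield path, open(path, 'rb')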
def load_to_db(doc_records):
    """Save Django objects

    Attempt to bulk create all documents and associated records
    atomically, falling back to creating each document and its associated
    records atomically, one document at a time.
    """
    # NOTE: saving referenced Source objects before the documents is
    # currently disabled:
    # for doc_record in doc_records:
    #     doc = doc_record[0]
    #     source = doc.source
    #     try:
    #         db_source, created = _with_retry(Source.get_or_create)(
    #             scopus_source_id=source.scopus_source_id,
    #             issn_print=source.issn_print,
    #             issn_electronic=source.issn_electronic)
    #     except Exception:
    #         json_log(error='Loading to database failed',
    #                  context={'object': source},
    #                  exception=True)
    #     source.pk = db_source.pk
    #     if created:
    #         # store other fields
    #         source.save()
    #     assert doc.source.pk is not None
    #     doc.source_id = doc.source.pk

    try:
        _with_retry(bulk_create)(doc_records)
    except Exception:
        json_log(error='Falling back to one-by-one',
                 method=logging.debug)
        # If the bulk transaction fails, fall back to creating each
        # document one by one, logging any that still fail.
        for doc_record in doc_records:
            try:
                _with_retry(create_doc)(doc_record)
            except Exception:
                json_log(error='Loading to database failed',
                         context={'eid': doc_record[0].eid},
                         exception=True)
    finally:
        # Avoid memory leak when DEBUG == True
        django.db.reset_queries()
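# _with_retry wraps the database calls above but is not shown in this
# excerpt. A sketch of what such a wrapper might look like, assuming it
# retries transient database errors after closing the (possibly broken)
# connection so Django reopens it; the retry count, delay, and the errors
# handled are assumptions.
import functools
import time

from django.db import OperationalError, connection


def _with_retry(func, attempts=3, delay=5):
    # Hypothetical sketch: call func, retrying a few times on transient
    # database errors before giving up.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        for attempt in range(attempts):
            try:
                return func(*args, **kwargs)
            except OperationalError:
                if attempt == attempts - 1:
                    raise
                connection.close()
                time.sleep(delay)
    return wrapper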
def truncate_fields(obj):
    try:
        # memoized per model class
        trunc_data = obj._meta.trunc_data
    except AttributeError:
        trunc_data = [(f.name, f.max_length)
                      for f in obj._meta.get_fields()
                      if getattr(f, 'max_length', None) is not None]
        obj._meta.trunc_data = trunc_data

    for name, max_length in trunc_data:
        val = getattr(obj, name, None)
        if val is not None and len(val) > max_length:
            json_log(error='Truncation of oversize {}.{} '
                           '(max_length={})'.format(type(obj).__name__,
                                                    name, max_length),
                     length=len(val),
                     context={'eid': getattr(obj, 'eid', None),
                              'obj': smart_text(obj)})
            setattr(obj, name, val[:max_length])
    return obj  # allow chaining
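# Because truncate_fields returns the object, it can be chained before a
# save. A hypothetical usage; the Document field names used here (eid,
# title) are assumptions about the model.
def _example_truncate_and_save():
    doc = Document(eid=85012345678, title='A' * 10000)
    truncate_fields(doc).save()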
def _process_one(tup):
    path, doc_file, citedby_file = tup
    try:
        item = {'document': extract_document_information(doc_file),
                # citation extraction is currently disabled:
                # 'citation': extract_document_citations(citedby_file),
                'citation': ''}
    except Exception:
        json_log(error='Uncaught error in extraction from XML',
                 context={'path': path},
                 exception=True)
        return
    if item['document'] is not None:
        try:
            return aggregate_records(item)
        except Exception:
            json_log(error='Uncaught error in producing django records',
                     context={'eid': item['document'].get('eid')},
                     exception=True)
            return
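# For debugging, _process_one can be driven directly from
# generate_xml_pairs without a pool. A hypothetical single-archive run;
# the path is made up.
def _example_process_single_archive():
    pairs = generate_xml_pairs('/data/scopus/2-s2.0-85012345678.zip')
    for pair in pairs:
        doc_record = _process_one(pair)
        if doc_record is not None:
            load_to_db([doc_record])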
def extract_and_load_docs(path):
    counter = -1
    itemid_batch = []
    authorship_batch = []
    document_batch = []
    citation_batch = []
    for counter, (path, doc_file, citedby_file) in enumerate(
            generate_xml_pairs(path)):
        if counter % MAX_BATCH_SIZE == 0:
            if counter > 0:
                logging.info('Saving after %d records' % counter)
                load_to_db(itemid_batch, authorship_batch,
                           citation_batch, document_batch)
                itemid_batch = []
                authorship_batch = []
                document_batch = []
                citation_batch = []
        item = {'document': extract_document_information(doc_file),
                'citation': extract_document_citations(citedby_file)}
        if item['document'] is None:
            json_log(method=logging.error,
                     error='Issue on xml_extract',
                     path=path,
                     exception=True)
        else:
            (itemids, authorships,
             citations, documents) = aggregate_records(item)
            itemid_batch.extend(itemids)
            authorship_batch.extend(authorships)
            citation_batch.extend(citations)
            document_batch.extend(documents)

    if counter < 0:
        json_log(error='Processed 0 records!', exception=True)
        return

    # At end of the year, flush out all remaining records
    logging.info('Saving after %d records' % counter)
    load_to_db(itemid_batch, authorship_batch,
               citation_batch, document_batch)
    logging.info('Done')
def run(self):
    json_log(info='Started {}'.format(self.path))
    start = time.time()
    extract_and_load_docs(self.path)
    json_log(info='Processing of {} took {} seconds'.format(
        self.path, time.time() - start))
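# For reference, the functions above assume a module preamble roughly like
# the following. The import locations for the project's own models and
# extraction helpers, and the value of MAX_BATCH_SIZE, are assumptions
# based only on the names used above.
import functools
import itertools
import logging
import os
import re
import time

import django.db
from django.utils.encoding import smart_text

# Assumed locations; adjust to the real project layout.
from .models import Document, Source
from .extract import (extract_document_information,
                      extract_document_citations,
                      aggregate_records)

MAX_BATCH_SIZE = 1000  # assumed value; tune to database and memory budget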