Example #1
def create_queries_one_by_one(queries):
    # Iterate over each query; a divide-and-conquer approach would be more efficient
    for query in queries:
        try:
            query.save()
        except Exception:
            json_log(error='Loading to database failed', exception=True)
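For reference, the divide-and-conquer approach mentioned in the comment above could look roughly like the following sketch; the function name is made up, it assumes all queries are instances of the same Django model, and it reuses the project's json_log helper.

from django.db import transaction

def create_queries_bisecting(queries):
    # Try the whole chunk at once; only bisect (and eventually fall back to
    # a single record) when a save fails, so the per-record cost is paid
    # only for failing records. Assumes all queries belong to one model.
    if not queries:
        return
    try:
        with transaction.atomic():
            type(queries[0]).objects.bulk_create(queries)
    except Exception:
        if len(queries) == 1:
            json_log(error='Loading to database failed', exception=True)
            return
        mid = len(queries) // 2
        create_queries_bisecting(queries[:mid])
        create_queries_bisecting(queries[mid:])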
Example #2
def extract_and_load_docs(paths, pool=None):
    """Main driver for loading all XML from a path to a database

    Parameters
    ----------
    path : string

        This can either be a directory to be recursed (containing XML or Zip or
        TAR), or a single Zip or TAR file.
    """
    if isinstance(paths, basestring):
        paths = [paths]

    def already_saved(eid):
        return Document.objects.filter(eid=eid).exists()

    xml_pairs = itertools.chain.from_iterable(
        generate_xml_pairs(path, _with_retry(already_saved))
        for path in paths)

    if pool is None:
        try:
            from itertools import imap  # Python 2
        except ImportError:
            imap = map  # Python 3: the built-in map is already lazy
    else:
        imap = functools.partial(pool.imap_unordered, chunksize=200)

    counter = -1
    doc_records = []

    for counter, doc_record in enumerate(imap(_process_one, xml_pairs)):
        if counter % MAX_BATCH_SIZE == 0:
            if counter > 0:
                logging.info('Saving after %d records' % counter)
                load_to_db(doc_records)
            doc_records = []

        if doc_record is None:
            continue

        doc_records.append(doc_record)

    if counter < 0:
        json_log(error='Processed 0 records!', method=logging.error)
        return

    # At the end of the year, flush out all remaining records
    logging.info('Saving after %d records' % counter)
    load_to_db(doc_records)
    logging.info('Done')
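A hypothetical way to drive this function with a worker pool; the paths and pool size below are made up, and any object exposing imap_unordered would work.

import multiprocessing

if __name__ == '__main__':
    # One or more directories / Zip / TAR files (hypothetical locations).
    paths = ['/data/scopus/2018', '/data/scopus/2019.zip']

    # pool may be omitted for sequential processing; here the XML pairs are
    # fanned out over four worker processes.
    pool = multiprocessing.Pool(processes=4)
    try:
        extract_and_load_docs(paths, pool=pool)
    finally:
        pool.close()
        pool.join()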
Example #3
def generate_xml_pairs(path, eid_filter=None, count_only=False):
    """Finds and returns contents for pairs of XML documents and citedby

    path may be:
        * a directory in which to find XML/TAR/ZIP files
        * a tar file
        * a zip file
    """
    n_skips = 0
    backlog = {}
    for path, f in _generate_files(path):
        if not path.endswith('.xml'):
            if f is not None:
                f.close()
            continue

        # TODO: filter before opening, or after pairing to avoid DB queries
        if (eid_filter is not None
                and eid_filter(int(re.findall('(?<=2-s2.0-)[0-9]+', path)[-1]))):
            n_skips += 1
            if n_skips % 100000 == 0:
                json_log(info='Skipped %d files so far' % n_skips,
                         method=logging.info)
            f.close()
            continue
        if count_only:
            xml = None
        else:
            xml = f.read()
            f.close()
        key = os.path.dirname(path)
        if key in backlog:
            other_path, other_xml = backlog.pop(key)
            if other_path == path:
                json_log(error='Found duplicate xmls for %r' % path,
                         method=logging.error)
                backlog[key] = (path, xml)
                continue
            if path.endswith('citedby.xml'):
                yield other_path, other_xml, xml
            else:
                assert other_path.endswith('citedby.xml'), other_path
                yield path, xml, other_xml

        else:
            backlog[key] = (path, xml)

    if n_skips:
        json_log(info='Skipped %d files (two per doc) altogether' % n_skips,
                 method=logging.warning)
    if backlog:
        json_log(error='Found unpaired XML files: %s'
                 % [path for path, _ in backlog.values()],
                 exception=True,
                 method=logging.error)
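For reference, the eid_filter above is keyed on the numeric part of Scopus-style '2-s2.0-<digits>' file names; a small illustration of that parsing with a made-up path:

import re

# Hypothetical file name; the real paths come from _generate_files.
sample_path = 'zip/2018/2-s2.0-85012345678.xml'
eid = int(re.findall('(?<=2-s2.0-)[0-9]+', sample_path)[-1])
print(eid)  # 85012345678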
Example #4
def load_to_db(doc_records):
    """Save Django objects

    Save referenced sources first, then attempt to bulk create
    all documents and associated records atomically, falling
    back to creating each document and associated records atomically.
    """

    for doc_record in doc_records:
        doc = doc_record[0]
#        source = doc.source
#        try:
#            db_source, created = _with_retry(Source.get_or_create)(
#                scopus_source_id=source.scopus_source_id,
#                issn_print=source.issn_print,
#                issn_electronic=source.issn_electronic)
#        except Exception:
#            json_log(error='Loading to database failed',
#                     context={'object': source},
#                     exception=True)
#        source.pk = db_source.pk
#        if created:
#            # store other fields
#            source.save()
#        assert doc.source.pk is not None
#        doc.source_id = doc.source.pk

    try:
        _with_retry(bulk_create)(doc_records)
    except Exception:
        json_log(error='Falling back to one-by-one',
                 method=logging.debug)
        # If the bulk transaction fails, fall back to creating each record
        # one by one, logging any that fail.
        for doc_record in doc_records:
            try:
                _with_retry(create_doc)(doc_record)
            except Exception:
                json_log(error='Loading to database failed',
                         context={'eid': doc_record[0].eid},
                         exception=True)
    finally:
        # Avoid memory leak when DEBUG == True
        django.db.reset_queries()
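The bulk_create and create_doc helpers called above are not shown in these examples. A minimal sketch of what they might look like, assuming doc_record[0] is the Document instance and ignoring the associated records:

from django.db import transaction

def bulk_create(doc_records):
    # All-or-nothing: one transaction for the whole batch, so any failure
    # rolls everything back and triggers the one-by-one fallback above.
    with transaction.atomic():
        Document.objects.bulk_create([rec[0] for rec in doc_records])

def create_doc(doc_record):
    # Per-document fallback: each document gets its own transaction.
    with transaction.atomic():
        doc_record[0].save()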
Example #5
    def truncate_fields(obj):
        try:
            # memoize
            trunc_data = obj._meta.trunc_data
        except AttributeError:
            trunc_data = [(f.name, f.max_length)
                          for f in obj._meta.get_fields()
                          if getattr(f, 'max_length', None) is not None]
            obj._meta.trunc_data = trunc_data

        for name, max_length in trunc_data:
            val = getattr(obj, name, None)
            if val is not None and len(val) > max_length:
                json_log(error='Truncation of oversize {}.{} (max_length={})'
                         .format(type(obj).__name__, name, max_length),
                         length=len(val),
                         context={'eid': eid, 'obj': smart_text(obj)})
                setattr(obj, name, val[:max_length])

        return obj  # allow chaining
Example #6
def _process_one(tup):
    path, doc_file, citedby_file = tup
    try:
        item = {'document': extract_document_information(doc_file),
                'citation': "" } #extract_document_citations(citedby_file)}
    except Exception:
        json_log(error='Uncaught error in extraction from XML',
                 context={'path': path},
                 exception=True)
        return

    if item['document'] is not None:
        try:
            return aggregate_records(item)
        except Exception:
            json_log(error='Uncaught error in producing django records',
                     context={'eid': item['document'].get('eid')},
                     exception=True)
            return
Example #7
def extract_and_load_docs(path):
    counter = -1
    itemid_batch = []
    authorship_batch = []
    document_batch = []
    citation_batch = []

    for counter, (path, doc_file, citedby_file) in enumerate(generate_xml_pairs(path)):
        if counter % MAX_BATCH_SIZE == 0:
            if counter > 0:
                logging.info('Saving after %d records' % counter)
                load_to_db(itemid_batch, authorship_batch,
                           citation_batch, document_batch)
            itemid_batch = []
            authorship_batch = []
            document_batch = []
            citation_batch = []

        item = {'document': extract_document_information(doc_file),
                'citation': extract_document_citations(citedby_file)}

        if item['document'] is None:
            json_log(method=logging.error, error='Issue on xml_extract',
                     path=path, exception=True)
        else:
            (itemids, authorships, citations, documents) = aggregate_records(item)
            itemid_batch.extend(itemids)
            authorship_batch.extend(authorships)
            citation_batch.extend(citations)
            document_batch.extend(documents)

    if counter < 0:
        json_log(error='Processed 0 records!', exception=True)
        return

    # At the end of the year, flush out all remaining records
    logging.info('Saving after %d records' % counter)
    load_to_db(itemid_batch, authorship_batch, citation_batch, document_batch)
    logging.info('Done')
Example #8
    def run(self):
        json_log(info='Started {}'.format(self.path))
        start = time.time()
        extract_and_load_docs(self.path)
        json_log(info='Processing of {} took {} seconds'.format(
            self.path, time.time() - start))