Beispiel #1
0
 def _move_to_done(obj, eng):
     """Move every file listed in ``obj.data['collections']`` to the target folder.

     Each file keeps its base name; one info line is logged per moved file.
     Nothing happens when ``obj.data`` has no ``'collections'`` entry.
     """
     destination_dir = get_storage_path(suffix=target_folder)
     for source_path in obj.data.get('collections', {}).values():
         destination_path = join(destination_dir, basename(source_path))
         move(source_path, destination_path)
         obj.log.info(
             "Moved {0} to {1}".format(source_path, destination_dir)
         )
 def _get_files_from_ftp(obj, eng):
     """Download files over FTP and record the results on ``obj.data``.

     The two values returned by ``ftp_download_files`` are stored under
     ``'all_files'`` and ``'new_files'`` respectively, and their sizes
     are logged.
     """
     download_dir = get_storage_path(suffix=target_folder)
     all_files, new_files = ftp_download_files(
         source_folder,
         download_dir,
         server=server,
         netrc_file=get_netrc()
     )
     obj.data['all_files'] = all_files
     obj.data['new_files'] = new_files

     new_count = len(obj.data["new_files"])
     total_count = len(obj.data["all_files"])
     obj.log.info(
         "{0} new files downloaded, in total {1} files".format(
             new_count, total_count
         )
     )
Beispiel #3
0
 def _get_files_from_ftp(obj, eng):
     """Download files over FTP using the settings in ``obj.extra_data['config']``.

     The server comes from the mandatory ``'ftp_server'`` key and the netrc
     file from the optional ``'ftp_netrc_file'`` key. The two values returned
     by ``ftp_download_files`` are stored under ``obj.data['all_files']`` and
     ``obj.data['new_files']``, and their sizes are logged.
     """
     netrc_path = obj.extra_data["config"].get("ftp_netrc_file")
     download_dir = get_storage_path(suffix=target_folder)
     all_files, new_files = ftp_download_files(
         source_folder,
         download_dir,
         server=obj.extra_data["config"]["ftp_server"],
         netrc_file=netrc_path
     )
     obj.data['all_files'] = all_files
     obj.data['new_files'] = new_files

     new_count = len(obj.data["new_files"])
     total_count = len(obj.data["all_files"])
     obj.log.info(
         "{0} new files downloaded, in total {1} files".format(
             new_count, total_count
         )
     )
    def _convert_files(obj, eng):
        """Convert extracted files whose date lies inside the harvest window.

        The window is read from ``obj.extra_data['args']`` and the resolved
        ``to_date`` / ``from_date`` values are written back to it. For every
        file in ``obj.data['extracted_files']`` inside the window, the record
        returned by ``ws.get_record`` (when truthy) is written to the target
        folder under the file's base name.

        Results: ``obj.data['insert']`` holds the written paths and
        ``obj.data['result_path']`` the output folder.
        """
        from invenio_knowledge.api import get_kb_mappings

        journal_mappings = {
            entry['key']: entry['value']
            for entry in get_kb_mappings('JOURNALS')
        }
        converter = WorldScientific(journal_mappings)

        output_dir = get_storage_path(suffix=target_folder)

        args = obj.extra_data['args']

        # By default, the end of the window is today.
        to_date = args.get("to_date")
        if not to_date:
            to_date = datetime.now().strftime('%Y-%m-%d')

        # Start of the window: explicit argument, then the cached date,
        # and as a last resort one week ago.
        from_date = args.get("from_date")
        if not from_date:
            from_date = cache.get(date_key)
        if not from_date:
            week_ago = datetime.now() - timedelta(days=7)
            from_date = week_ago.strftime('%Y-%m-%d')

        obj.extra_data['args']["to_date"] = to_date
        obj.extra_data['args']["from_date"] = from_date

        written_paths = []
        for source_path in obj.data['extracted_files']:
            article_date = converter.get_date(source_path)
            if not (from_date <= article_date <= to_date):
                continue
            marc = converter.get_record(source_path)
            if not marc:
                continue
            output_path = join(output_dir, basename(source_path))
            written_paths.append(output_path)
            with open(output_path, 'w') as outfile:
                outfile.write(marc)

        obj.log.info(
            "Converted {0} articles between {1} to {2}".format(
                len(written_paths), from_date, to_date
            )
        )

        obj.data['insert'] = written_paths
        obj.data["result_path"] = output_dir

        obj.log.debug("Saved converted files to {0}".format(output_dir))
        obj.log.debug("{0} files to add".format(len(obj.data["insert"])))
 def _unzip_files(obj, eng):
     """Unzip every archive in ``obj.data['all_files']`` into the target folder.

     Archives that raise ``BadZipfile`` are logged as errors and skipped.
     The collected result of all ``unzip`` calls is stored in
     ``obj.data['extracted_files']``.
     """
     destination_dir = get_storage_path(suffix=target_folder)
     archive_paths = obj.data.get('all_files', [])
     unpacked = []
     for archive_path in archive_paths:
         try:
             unpacked.extend(unzip(archive_path, destination_dir))
         except BadZipfile as err:
             # Best effort: report the broken archive and carry on.
             message = "Error unzipping file {0}: {1}".format(
                 archive_path, err
             )
             obj.log.error(message)
     obj.data['extracted_files'] = unpacked
     obj.log.debug(
         "{0} new files extracted".format(len(obj.data["extracted_files"]))
     )
Beispiel #6
0
    def _convert_files(obj, eng):
        """Convert extracted files whose date lies inside the harvest window.

        Reads ``to_date``, ``from_date`` and ``reharvest`` from
        ``obj.extra_data['args']``. Missing dates are defaulted and written
        back to the args: ``to_date`` becomes today; ``from_date`` becomes
        ``1900-01-01`` on a reharvest, otherwise ``weeks_threshold`` weeks
        before now.

        On a reharvest the files come from ``obj.data['all_extracted_files']``,
        otherwise from ``obj.data['newly_extracted_files']``. A file is
        converted when ``ws.get_date`` returns ``None`` or a date inside the
        window; other files are logged as filtered out. Each truthy record
        returned by ``ws.get_record`` is written to the target folder under
        the file's base name.

        Results: ``obj.data['insert']`` holds the written paths and
        ``obj.data['result_path']`` the output folder.
        """
        from invenio_knowledge.api import get_kb_mappings

        mappings = {
            item['key']: item['value']
            for item in get_kb_mappings('JOURNALS')
        }
        ws = WorldScientific(mappings)

        target_folder_full = get_storage_path(suffix=target_folder)

        args = obj.extra_data['args']
        date_format = '%Y-%m-%d'

        # By default, the end of the window is today.
        to_date = args.get("to_date") or datetime.now().strftime(date_format)

        from_date = args.get("from_date")
        if not from_date:
            # Both bounds are kept as '%Y-%m-%d' strings so the chained
            # comparison below never mixes str with datetime.
            # NOTE(review): assumes ws.get_date returns the same string
            # format -- confirm against WorldScientific.get_date.
            if args.get("reharvest"):
                # Since the "beginning" of time when not specified.
                from_date = "1900-01-01"
            else:
                # Dynamic date in the past when not specified and not a
                # reharvest. The subtraction is parenthesised so strftime
                # applies to the resulting datetime, not to the timedelta.
                from_date = (
                    datetime.now() - timedelta(weeks=weeks_threshold)
                ).strftime(date_format)

        obj.extra_data['args']["to_date"] = to_date
        obj.extra_data['args']["from_date"] = from_date

        insert_files = []
        if args.get("reharvest"):
            filenames = obj.data['all_extracted_files']
        else:
            filenames = obj.data['newly_extracted_files']

        for filename in filenames:
            date = ws.get_date(filename)
            if date is not None and not (from_date <= date <= to_date):
                obj.log.info("Filtered out {0} ({1})".format(filename, date))
                continue
            marc = ws.get_record(filename)
            if marc:
                output_path = join(target_folder_full, basename(filename))
                insert_files.append(output_path)
                with open(output_path, 'w') as outfile:
                    outfile.write(marc)

        obj.log.info("Converted {0}/{1} articles between {2} to {3}".format(
            len(insert_files),
            len(filenames),
            from_date,
            to_date
        ))

        obj.data['insert'] = insert_files
        obj.data["result_path"] = target_folder_full

        obj.log.debug("Saved converted files to {0}".format(target_folder_full))
        obj.log.debug("{0} files to add".format(
            len(obj.data["insert"]),
        ))