Example #1
def create_collection(converted_files, new_files):
    """Creates the record collection file
    uploads it to the FTP server and sends
    an email to inform about the harvest"""
    target_file = "edpsciences.%s.xml" % \
                  (datetime.now().strftime("%Y-%m-%d"),)
    target_file = join(CFG_EDPSCIENCE_OUT_FOLDER, target_file)
    write_message("Creating collection file: %s" % (target_file,))
    with open(target_file, 'w') as collection:
        collection.write('<collection>\n')
        for fl in converted_files:
            recordfile = open(fl)
            collection.write(recordfile.read())
            recordfile.close()
        collection.write('\n</collection>')
    submit_records_via_ftp(target_file)
    body = ['From %s sources, found and converted %s records'
            % (len(new_files), len(converted_files)),
            '\t%s records ready to upload:\n'
            % (len(converted_files),),
            '\t%s uploaded to server:'
            % (target_file,)]
    body = '\n'.join(body)
    subject = "EDP Sciences harvest results: %s" % \
              (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    write_message(body)
    if submit_records_via_mail(subject, body, CFG_SITE_SUPPORT_EMAIL):
        write_message("Mail sent to %r" % (CFG_SITE_SUPPORT_EMAIL,))
    else:
        write_message("ERROR: Cannot send mail.")
Example #2
def create_collection(batch_size, new_files, new_sources, directory, submit):
    """Create a single xml file "collection.xml"
    that contains all the records."""
    subject = "Consyn harvest results: %s" % \
              (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    batch = 1
    counter = 1
    files_to_upload = []
    collection = None
    date = datetime.now().strftime("%Y.%m.%d")
    prefix = "elsevier-{0}".format(date)
    for filename in new_files:
        if counter == 1:
            filepath = get_available_filename(directory, prefix, batch)
            collection = open(filepath, 'w')
            collection.write("<collection>\n")
        with open(filename) as f:
            collection.write(f.read() + '\n')
            counter += 1
        if counter == batch_size:
            collection.write("</collection>")
            collection.close()
            files_to_upload.append(filepath)
            collection = None
            counter = 1
            batch += 1
    if counter < batch_size and collection:
        collection.write("</collection>")
        collection.close()
        files_to_upload.append(filepath)
    body = [
        'From %s sources, found and converted %s records' %
        (len(new_sources), len(new_files)),
        '\t%s records ready to upload:\n' % ((batch - 1) * batch_size + counter, )
    ]
    if submit:
        body += ['\tFiles uploaded to Server:']
        for filepath in files_to_upload:
            try:
                submit_records_via_ftp(filepath)
                filename = filepath.split('/')[-1]
                body.append("\t%s (%s records)" % (filename, batch_size))
            except Exception:
                _errors_detected.append(
                    Exception("Failed to upload %s to FTP server" % filepath))
                write_message("Failed to upload %s to FTP server" % filepath)
    else:
        body += ['\tFiles ready for upload:']
        for filename in files_to_upload:
            body.append("\t%s (%s records)" % (filename, batch_size))
    if files_to_upload:
        body = '\n'.join(body)
        write_message(subject)
        write_message(body)
        if submit:
            if submit_records_via_mail(subject, body, CFG_CONSYNHARVEST_EMAIL):
                write_message("Mail sent to %r" % (CFG_CONSYNHARVEST_EMAIL, ))
            else:
                write_message("ERROR: Cannot send mail.")
    else:
        write_message("No new files!")
Example #3
    def check_records(self):
        """
        Checks if the given records already exist in the system and returns
        a tuple of the records that are new and those that already exist:

        @return: a tuple of (new_records, existing_records)
        @rtype: tuple
        """
        # We check if any records already exist
        new_records = []
        existing_records = []
        for record in self.records_harvested:
            # Do we already have the record id perhaps?
            if not record.recid:
                try:
                    record.recid = get_record_from_doi(record.doi)
                except APSHarvesterSearchError, e:
                    write_message("Error while getting recid from %s: %s" %
                                  (record.doi, str(e)))

                    # Problem detected, send mail immediately:
                    problem_rec = generate_xml_for_records(
                        records=[record],
                        directory=self.out_folder,
                        suffix="problem.xml")
                    subject = "APS harvest problem: %s" % \
                              (self.date_started.strftime("%Y-%m-%d %H:%M:%S"),)
                    body = "There was a problem harvesting %s. \n %s \n Path: \n%s" % \
                           (record.doi, str(e), problem_rec)
                    submit_records_via_mail(subject, body,
                                            CFG_APSHARVEST_EMAIL)
                    continue

            # What about now?
            if record.recid:
                existing_records.append(record)
            else:
                new_records.append(record)
        return new_records, existing_records
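get_record_from_doi() is only called, never defined, in these examples. A hedged sketch of such a lookup, assuming an Invenio 1.x installation where perform_request_search() is available; the MARC field used for the DOI (0247_a) is an assumption about the local configuration.

# Hedged sketch of get_record_from_doi(); the search field is an assumption.
from invenio.search_engine import perform_request_search


def get_record_from_doi(doi):
    """Return the first recid matching `doi`, or None if nothing is found."""
    hits = perform_request_search(p='0247_a:"%s"' % doi, of='id')
    return hits[0] if hits else None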
Example #4
    def check_records(self):
        """
        Checks if the given records already exist in the system and returns
        a tuple of the records that are new and those that already exist:

        @return: a tuple of (new_records, existing_records)
        @rtype: tuple
        """
        # We check if any records already exist
        new_records = []
        existing_records = []
        for record in self.records_harvested:
            # Do we already have the record id perhaps?
            if not record.recid:
                try:
                    record.recid = get_record_from_doi(record.doi)
                except APSHarvesterSearchError, e:
                    write_message("Error while getting recid from %s: %s" %
                                  (record.doi, str(e)))

                    # Problem detected, send mail immediately:
                    problem_rec = generate_xml_for_records(records=[record],
                                                           directory=self.out_folder,
                                                           suffix="problem.xml")
                    subject = "APS harvest problem: %s" % \
                              (self.date_started.strftime("%Y-%m-%d %H:%M:%S"),)
                    body = "There was a problem harvesting %s. \n %s \n Path: \n%s" % \
                           (record.doi, str(e), problem_rec)
                    submit_records_via_mail(subject, body, CFG_APSHARVEST_EMAIL)
                    continue

            # What about now?
            if record.recid:
                existing_records.append(record)
            else:
                new_records.append(record)
        return new_records, existing_records
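generate_xml_for_records() is likewise referenced but not shown. A hedged reconstruction of what such a helper could do, assuming each record object exposes its MARCXML as a string attribute (an assumption, not something visible in the examples):

# Hedged sketch of generate_xml_for_records(); `record.marcxml` is an
# assumed attribute, and the real APS harvester version may differ.
import os
from datetime import datetime


def generate_xml_for_records(records, directory, prefix="aps", suffix=".xml"):
    """Write the records into one <collection> file and return its path."""
    filename = "%s_%s%s" % (prefix,
                            datetime.now().strftime("%Y-%m-%d_%H%M%S"),
                            suffix)
    filepath = os.path.join(directory, filename)
    with open(filepath, "w") as output:
        output.write("<collection>\n")
        for record in records:
            output.write(record.marcxml + "\n")  # assumed attribute
        output.write("</collection>")
    return filepath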
Example #5
    def submit_records(self, records_filename, mode, update=False,
                       taskid=0, silent=False):
        """
        Performs the logic to submit given file (filepath) of records
        either by e-mail or using BibUpload with given mode.

        Taskid is given to indicate if the task submission should wait for any
        previously submitted tasks.

        The submission can also be made "silent" in the sense of not
        updating the modification date of the records.

        @param records_filename: filepath to XML file containing records.
        @type records_filename: string

        @param update: submit self.records_to_update instead of
            self.records_to_insert
        @type update: bool

        @param mode: which submission mode is it?
        @type mode: string

        @param taskid: bibsched taskid, wait for task to complete before submission
        @type taskid: int

        @param silent: do not update the modification date of the records
        @type silent: bool

        @return: returns the given taskid upon submission, or True/False from email.
        """
        if update:
            records_list = self.records_to_update
        else:
            records_list = self.records_to_insert

        # Check if we should create bibupload or e-mail
        if mode == "email":
            # Let's parse the records and find our IDs.
            list_of_dois = []
            for record in records_list:
                # We strip away the first part of the DOI for readability.
                list_of_dois.append('/'.join(record.doi.split('/')[1:]))
            # We send an e-mail to CFG_APSHARVEST_EMAIL and put file on AFS.
            body = "Harvested new records: %s" % (records_filename,)
            try:
                try:
                    shutil.move(records_filename, self.out_folder)
                    records_filename = os.path.join(self.out_folder,
                                                    os.path.basename(records_filename))
                    body = "Harvested new records on %s. They are located here:\n %s" % \
                           (self.date_started.strftime("%Y-%m-%d %H:%M:%S"), records_filename)
                except IOError, e:
                    # Some IOError
                    body = "Error while harvesting records: \nError saving %s - %s" % \
                           (records_filename, str(e))
                    raise e
            finally:
                submit_records_via_ftp(records_filename)
                body = "%s\nRecords harvested (%s total):\n%s\n" % (body,
                                                                    str(len(list_of_dois)),
                                                                    "\n".join(list_of_dois))

                body = "%s\nUploaded to FTP: %s" % (
                    body,
                    os.path.basename(records_filename)
                )

                res = submit_records_via_mail(self.mail_subject, body, CFG_APSHARVEST_EMAIL)
                write_message("Sent e-mail to %s with path to %s" %
                              (CFG_APSHARVEST_EMAIL, records_filename))
                return res
        else:
            # We submit a BibUpload task and wait for it to finish
            task_update_progress("Waiting for task to finish")

            if taskid != 0:
                write_message("Going to wait for %d to finish" % (taskid,))

            while not can_launch_bibupload(taskid):
                # Let's wait until the previously launched task exits.
                task_sleep_now_if_required(can_stop_too=False)
                time.sleep(5.0)

            taskid = submit_bibupload_for_records(mode, records_filename, silent)
            write_message("Submitted BibUpload task #%s with mode %s" %
                         (str(taskid), mode))
            return taskid
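The BibUpload branch above ends with submit_bibupload_for_records(), which is not shown either. A hedged sketch, assuming Invenio's task_low_level_submission() scheduler API; the mode-to-flag mapping and the use of --notimechange for "silent" uploads are assumptions rather than facts from this page.

# Hedged sketch of submit_bibupload_for_records(); flags are assumptions.
from invenio.bibtask import task_low_level_submission


def submit_bibupload_for_records(mode, records_filename, silent=False):
    """Schedule a BibUpload task for the given file and return its task id."""
    mode_flag = {"insert": "-i", "replace": "-r",
                 "correct": "-c", "append": "-a"}.get(mode, "-i")
    args = [mode_flag, records_filename]
    if silent:
        args.append("--notimechange")  # assumed: keep modification dates
    return task_low_level_submission("bibupload", "admin", *args)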
Example #6
    def process_record_submission(self, parameters):
        """Run the submission process."""
        if parameters.get("match"):
            # We will do a simple match with the database
            new_records, existing_records = self.check_records()
            self.records_to_insert.extend(new_records)
            self.records_to_update.extend(existing_records)
        else:
            # We insert everything
            self.records_to_insert.extend(self.records_harvested)

        if self.records_to_insert:
            # Submit new records
            record_filename = generate_xml_for_records(
                self.records_to_insert,
                self.out_folder,
                prefix=self.get_file_prefix(parameters),
                suffix="_insert.xml"
            )

            if not parameters.get("devmode"):
                taskid = self.submit_records(record_filename,
                                             parameters.get("new_mode"))
                if not taskid:
                    # Something went wrong
                    err_string = "New records (%s)" \
                                 " were not submitted correctly" % \
                                 (record_filename,)
                    raise APSHarvesterSubmissionError(err_string)
            self.records_to_insert = []

        if self.records_to_update:
            # Submit records to update
            record_filename = generate_xml_for_records(
                self.records_to_update,
                self.out_folder,
                prefix=self.get_file_prefix(parameters),
                suffix="_update.xml"
            )

            if not parameters.get("devmode"):
                taskid = self.submit_records(record_filename,
                                             parameters.get("update_mode"),
                                             update=True,
                                             silent=bool(parameters.get("records")))
                if not taskid:
                    # Something went wrong
                    err_string = "Existing records (%s)" \
                                 " were not submitted correctly" % \
                                 (record_filename,)
                    raise APSHarvesterSubmissionError(err_string)
            self.records_to_update = []

        if self.records_failed:
            body = "\n".join(["%s failed with error: %s"
                              % (rec.doi or rec.recid, msg)
                              for rec, msg in self.records_failed])
            if not parameters.get("devmode"):
                submit_records_via_mail(subject="%s (failed records)" % (self.mail_subject,),
                                        body=body,
                                        toaddr=CFG_APSHARVEST_EMAIL)
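submit_records_via_mail() is only expected to return a truthy value on success in the examples above. A minimal sketch built on the standard library; the SMTP host and sender address are assumptions, and the real helper probably goes through Invenio's own mail layer.

# Hedged sketch of submit_records_via_mail(); host and sender are assumed.
import smtplib
from email.mime.text import MIMEText


def submit_records_via_mail(subject, body, toaddr,
                            fromaddr="harvest@localhost",  # assumed sender
                            smtp_host="localhost"):        # assumed host
    """Send the harvest report; return True on success, False otherwise."""
    msg = MIMEText(body)
    msg["Subject"] = subject
    msg["From"] = fromaddr
    msg["To"] = toaddr
    try:
        server = smtplib.SMTP(smtp_host)
        server.sendmail(fromaddr, [toaddr], msg.as_string())
        server.quit()
        return True
    except smtplib.SMTPException:
        return False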
Example #7
def create_collection(batch_size, new_files, new_sources,
                      directory, submit):
    """Create a single xml file "collection.xml"
    that contains all the records."""
    subject = "Consyn harvest results: %s" % \
              (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    batch = 1
    counter = 1
    date = datetime.now().strftime("%Y.%m.%d")
    files_to_upload = []
    collection = None
    for filename in new_files:
        if counter == 1:
            filepath = "elsevier-%s-%s.xml" % (date, batch)
            filepath = join(directory, filepath)
            filepath = filepath.lstrip()
            collection = open(filepath, 'w')
            collection.write("<collection>\n")
        with open(filename) as f:
            collection.write(f.read() + '\n')
            counter += 1
        if counter == batch_size:
            collection.write("</collection>")
            collection.close()
            files_to_upload.append(filepath)
            collection = None
            counter = 1
            batch += 1
    if counter < batch_size and collection:
        collection.write("</collection>")
        collection.close()
        files_to_upload.append(filepath)
    body = ['From %s sources, found and converted %s records' %
            (len(new_sources), len(new_files)),
            '\t%s records ready to upload:\n' %
            ((batch - 1) * batch_size + counter,)]
    if submit:
        body += ['\tFiles uploaded to Server:']
        for filepath in files_to_upload:
            try:
                submit_records_via_ftp(filepath)
                filename = filepath.split('/')[-1]
                body.append("\t%s (%s records)" % (filename, batch_size))
            except Exception:
                _errors_detected.append(Exception(
                    "Failed to upload %s to FTP server" % filepath)
                )
                write_message("Failed to upload %s to FTP server" % filepath)
    else:
        body += ['\tFiles ready for upload:']
        for filename in files_to_upload:
            body.append("\t%s (%s records)" % (filename, batch_size))
    if len(body) > 3:
        #update the last line of the message
        body[-1] = "\t%s (%s records)" % (filename, counter)
        body = '\n'.join(body)
        write_message(subject)
        write_message(body)
    else:
        body = '\n'.join(body)
        write_message(subject)
        write_message("No new files!")
    if submit:
        if submit_records_via_mail(subject, body, CFG_CONSYNHARVEST_EMAIL):
            write_message("Mail sent to %r" % (CFG_CONSYNHARVEST_EMAIL,))
        else:
            write_message("ERROR: Cannot send mail.")
Example #8
def create_collection(batch_size, new_files, new_sources, directory,
                      upload_FTP):
    """Create a single xml file "collection.xml"
    that contains all the records."""
    subject = "Consyn harvest results: %s" % \
              (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    if new_files:
        batch = 1
        counter = 0
        date = datetime.now().strftime("%Y.%m.%d")
        filepath = "elsevier-%s-%s.xml" % (date, batch)
        filepath = join(directory, filepath)
        filepath = filepath.lstrip()
        files_to_upload = []
        with open(filepath, 'w') as collection:
            collection.write("<collection>\n")
            for f in new_files:
                if counter == batch_size:
                    counter = 0
                    batch += 1
                    collection.write("</collection>")
                    collection.close()
                    files_to_upload.append(filepath)
                    filepath = "elsevier-%s-%s.xml" % (date, batch)
                    filepath = join(directory, filepath).lstrip()
                    collection = open(filepath, 'w')
                    collection.write("<collection>\n")
                xmlFile = open(f, 'r')
                xmlString = xmlFile.read()
                xmlFile.close()
                collection.write(xmlString + '\n')
                counter += 1
            collection.write("</collection>")
            files_to_upload.append(filepath)
        body = [
            'From %s sources, found and converted %s records' %
            (len(new_sources), len(new_files)),
            '\t%s records ready to upload:\n' % ((batch - 1) * batch_size + counter, )
        ]
        if upload_FTP:
            body += ['\tFiles uploaded to Server:']
        else:
            body += ['\tFiles ready for upload:']
        for filepath in files_to_upload:
            try:
                submit_records_via_ftp(filepath)
                filename = filepath.split('/')[-1]
                body.append("\t%s (%s records)" % (filename, batch_size))
            except Exception:
                write_message("Failed to upload %s to FTP server" % filepath)
        if len(body) > 3:
            # update the last line of the message
            body[-1] = "\t%s (%s records)" % (filename, counter)
        body = '\n'.join(body)

        write_message(subject)
        write_message(body)
        if submit_records_via_mail(subject, body, CFG_CONSYNHARVEST_EMAIL):
            write_message("Mail sent to %r" % (CFG_CONSYNHARVEST_EMAIL, ))
        else:
            write_message("ERROR: Cannot send mail.")
    else:
        write_message(subject)
        write_message("No new files")
Example #9
def create_collection(batch_size, new_files, new_sources, directory, upload_FTP):
    """Create a single xml file "collection.xml"
    that contains all the records."""
    subject = "Consyn harvest results: %s" % (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    if new_files:
        batch = 1
        counter = 0
        date = datetime.now().strftime("%Y.%m.%d")
        filepath = "elsevier-%s-%s.xml" % (date, batch)
        filepath = join(directory, filepath)
        filepath = filepath.lstrip()
        files_to_upload = []
        with open(filepath, "w") as collection:
            collection.write("<collection>\n")
            for f in new_files:
                if counter == batch_size:
                    counter = 0
                    batch += 1
                    collection.write("</collection>")
                    collection.close()
                    files_to_upload.append(filepath)
                    filepath = "elsevier-%s-%s.xml" % (date, batch)
                    filepath = join(directory, filepath).lstrip()
                    collection = open(filepath, "w")
                    collection.write("<collection>\n")
                xmlFile = open(f, "r")
                xmlString = xmlFile.read()
                xmlFile.close()
                collection.write(xmlString + "\n")
                counter += 1
            collection.write("</collection>")
            files_to_upload.append(filepath)
        body = [
            "From %s sources, found and converted %s records" % (len(new_sources), len(new_files)),
            "\t%s records ready to upload:\n" % ((batch - 1) * 500 + counter,),
        ]
        if upload_FTP:
            body += ["\tFiles uploaded to Server:"]
        else:
            body += ["\tFiles ready for upload:"]
        for filepath in files_to_upload:
            try:
                submit_records_via_ftp(filepath)
                filename = filepath.split("/")[-1]
                body.append("\t%s (%s records)" % (filename, batch_size))
            except Exception:
                write_message("Failed to upload %s to FTP server" % filepath)
        if len(body) > 3:
            # update the last line of the message
            body[-1] = "\t%s (%s records)" % (filename, counter)
        body = "\n".join(body)

        write_message(subject)
        write_message(body)
        if submit_records_via_mail(subject, body, CFG_CONSYNHARVEST_EMAIL):
            write_message("Mail sent to %r" % (CFG_CONSYNHARVEST_EMAIL,))
        else:
            write_message("ERROR: Cannot send mail.")
    else:
        write_message(subject)
        write_message("No new files")