def create_collection(converted_files, new_files):
    """Creates the record collection file
    uploads it to the FTP server and sends
    an email to inform about the harvest"""
    target_file = "edpsciences.%s.xml" % \
                  (datetime.now().strftime("%Y-%m-%d"),)
    target_file = join(CFG_EDPSCIENCE_OUT_FOLDER, target_file)
    write_message("Creating collection file: %s" % (target_file,))
    with open(target_file, 'w') as collection:
        collection.write('<collection>\n')
        for fl in converted_files:
            recordfile = open(fl)
            collection.write(recordfile.read())
            recordfile.close()
        collection.write('\n</collection>')
    submit_records_via_ftp(target_file)
    body = ['From %s sources, found and converted %s records'
            % (len(new_files), len(converted_files)),
            '\t%s records ready to upload:\n'
            % (len(converted_files),),
            '\t%s uploaded to server:'
            % (target_file,)]
    body = '\n'.join(body)
    subject = "EDP Sciences harvest results: %s" % \
              (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    write_message(body)
    if submit_records_via_mail(subject, body, CFG_SITE_SUPPORT_EMAIL):
        write_message("Mail sent to %r" % (CFG_SITE_SUPPORT_EMAIL,))
    else:
        write_message("ERROR: Cannot send mail.")
Example #2
def create_collection(batch_size, new_files, new_sources, directory, submit):
    """Create a single xml file "collection.xml"
    that contains all the records."""
    subject = "Consyn harvest results: %s" % \
              (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    batch = 1
    counter = 1
    files_to_upload = []
    collection = None
    date = datetime.now().strftime("%Y.%m.%d")
    prefix = "elsevier-{0}".format(date)
    for filename in new_files:
        if counter == 1:
            filepath = get_available_filename(directory, prefix, batch)
            collection = open(filepath, 'w')
            collection.write("<collection>\n")
        with open(filename) as f:
            collection.write(f.read() + '\n')
            counter += 1
        if counter == batch_size:
            collection.write("</collection>")
            collection.close()
            files_to_upload.append(filepath)
            counter = 1
            batch += 1
    if counter < batch_size and collection:
        collection.write("</collection>")
        collection.close()
        files_to_upload.append(filepath)
    body = [
        'From %s sources, found and converted %s records' %
        (len(new_sources), len(new_files)),
        '\t%s records ready to upload:\n' % ((batch - 1) * batch_size + counter, )
    ]
    if submit:
        body += ['\tFiles uploaded to Server:']
        for filepath in files_to_upload:
            try:
                submit_records_via_ftp(filepath)
                filename = filepath.split('/')[-1]
                body.append("\t%s (%s records)" % (filename, batch_size))
            except:
                _errors_detected.append(
                    Exception("Failed to upload %s to FTP server" % filepath))
                write_message("Failed to upload %s to FTP server" % filepath)
    else:
        body += ['\tFiles ready for upload:']
        for filename in files_to_upload:
            body.append("\t%s (%s records)" % (filename, batch_size))
    if files_to_upload:
        body = '\n'.join(body)
        write_message(subject)
        write_message(body)
        if submit:
            if submit_records_via_mail(subject, body, CFG_CONSYNHARVEST_EMAIL):
                write_message("Mail sent to %r" % (CFG_CONSYNHARVEST_EMAIL, ))
            else:
                write_message("ERROR: Cannot send mail.")
    else:
        write_message("No new files!")
Example #3
def create_collection(batch_size, new_files, new_sources,
                      directory, upload_FTP):
    """Create a single xml file "collection.xml"
    that contains all the records."""
    subject = "Consyn harvest results: %s" % \
              (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    if new_files:
        batch = 1
        counter = 0
        date = datetime.now().strftime("%Y.%m.%d")
        filepath = "elsevier-%s-%s.xml" % (date, batch)
        filepath = join(directory, filepath)
        filepath = filepath.lstrip()
        files_to_upload = []
        with open(filepath, 'w') as collection:
            collection.write("<collection>\n")
            for f in new_files:
                if counter == batch_size:
                    counter = 0
                    batch += 1
                    collection.write("</collection>")
                    collection.close()
                    files_to_upload.append(filepath)
                    filepath = "elsevier-%s-%s.xml" % (date, batch)
                    filepath = join(directory, filepath)
                    filepath = filepath.lstrip()
                    collection = open(filepath, 'w')
                    collection.write("<collection>\n")
                xmlFile = open(f, 'r')
                xmlString = xmlFile.read()
                xmlFile.close()
                collection.write(xmlString + '\n')
                counter += 1
            collection.write("</collection>")
            files_to_upload.append(filepath)
        body = ['From %s sources, found and converted %s records' % (len(new_sources), len(new_files)),
                '\t%s records ready to upload:\n' % ((batch - 1) * batch_size + counter,)]
        if upload_FTP:
            body += ['\tFiles uploaded to Server:']
        else:
            body += ['\tFiles ready for upload:']
        for filepath in files_to_upload:
            try:
                submit_records_via_ftp(filepath)
                filename = filepath.split('/')[-1]
                body.append("\t%s (%s records)" % (filename, batch_size))
            except:
                write_message("Failed to upload %s to FTP server" % filepath)
        if len(body) > 3:
            # update the last line of the message
            body[-1] = "\t%s (%s records)" % (filename, counter)
        body = '\n'.join(body)

        write_message(subject)
        write_message(body)
        report_records_via_mail(subject, body)
    else:
        write_message(subject)
        write_message("No new files")
Example #4
    def submit_records(self, records_filename, mode, update=False,
                       taskid=0, silent=False):
        """
        Submit the given file (filepath) of records,
        either by e-mail or via BibUpload with the given mode.

        Taskid is given to indicate if the task submission should wait for any
        previously submitted tasks.

        The submission can also be made "silent" in the sense of not
        updating the modification date of the records.

        @param records_filename: filepath to XML file containing records.
        @type records_filename: string

        @param update: submit the records scheduled for update instead
            of the records scheduled for insertion.
        @type update: bool

        @param mode: which submission mode is it?
        @type mode: string

        @param taskid: bibsched taskid, wait for task to complete before submission
        @type taskid: int

        @param silent: do not update the modification date of the records
        @type silent: bool

        @return: returns the given taskid upon submission, or True/False from email.
        """
        if update:
            records_list = self.records_to_update
        else:
            records_list = self.records_to_insert

        # Check if we should create bibupload or e-mail
        if mode == "email":
            # Lets parse the records and find our IDs.
            list_of_dois = []
            for record in records_list:
                # We strip away the first part of the DOI for readability.
                list_of_dois.append('/'.join(record.doi.split('/')[1:]))
            # We send an e-mail to CFG_APSHARVEST_EMAIL and put file on AFS.
            body = "Harvested new records: %s" % (records_filename,)
            try:
                try:
                    shutil.move(records_filename, self.out_folder)
                    records_filename = os.path.join(self.out_folder,
                                                    os.path.basename(records_filename))
                    body = "Harvested new records on %s. They are located here:\n %s" % \
                           (self.date_started.strftime("%Y-%m-%d %H:%M:%S"), records_filename)
                except IOError, e:
                    # Some IOError
                    body = "Error while harvesting records: \nError saving %s - %s" % \
                           (records_filename, str(e))
                    raise e
            finally:
                submit_records_via_ftp(records_filename)
                body = "%s\nRecords harvested (%s total):\n%s\n" % (body,
                                                                    str(len(list_of_dois)),
                                                                    "\n".join(list_of_dois))

                body = "%s\nUploaded to FTP: %s" % (
                    body,
                    os.path.basename(records_filename)
                )

                res = submit_records_via_mail(self.mail_subject, body, CFG_APSHARVEST_EMAIL)
                write_message("Sent e-mail to %s with path to %s" %
                              (CFG_APSHARVEST_EMAIL, records_filename))
                return res
        else:
            # We submit a BibUpload task and wait for it to finish
            task_update_progress("Waiting for task to finish")

            if taskid != 0:
                write_message("Going to wait for %d to finish" % (taskid,))

            while not can_launch_bibupload(taskid):
                # Lets wait until the previously launched task exits.
                task_sleep_now_if_required(can_stop_too=False)
                time.sleep(5.0)

            taskid = submit_bibupload_for_records(mode, records_filename, silent)
            write_message("Submitted BibUpload task #%s with mode %s" %
                         (str(taskid), mode))
            return taskid
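In e-mail mode, the submit_records method above shortens each DOI by dropping the registrant prefix before it goes into the report body. Stripped down, assuming DOIs such as 10.1103/PhysRevD.91.045001 (the value is only an illustration):

def short_doi(doi):
    """Drop the registrant prefix for readability, as in the e-mail body above."""
    return '/'.join(doi.split('/')[1:])


# short_doi("10.1103/PhysRevD.91.045001") -> 'PhysRevD.91.045001'

As the docstring notes, the method returns the BibUpload taskid in upload mode and the True/False result of the mail submission in e-mail mode.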
def main(args):
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        conference = identifier.split(':')[2]
        conference = conference.split('/')[0]
        contribution = identifier.split(':')[2]
        contribution = contribution.split('/')[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query, ))
        results = perform_request_search(p=query, of="id")

        #harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False

        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('.pdf'):
                found = True
                if results:
                    rec = create_record()
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec,
                                 '856',
                                 ind1='4',
                                 subfields=[('u', url), ('y', 'PoS server')])
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', filename), ('t', 'PoS'),
                                            ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, '001', controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url, ))
                break
        if not found:
            error_records.append(rec)

        #upload to FTP
        tempfile_path = '/tmp/%s.xml' % (contribution, )
        with open(tempfile_path, 'w') as tempfile:
            tempfile.write(record_xml_output(rec))
        try:
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            write_message("%s successfully uploaded to FTP server" %
                          tempfile_path)
        except:
            write_message("Failed to upload %s to FTP server" % tempfile_path)
        remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename, )
    append_filename = "%s.append.xml" % (input_filename, )
    errors_filename = "%s.errors.xml" % (input_filename, )

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(
        error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
           (total_records,
            len(insert_records),
            len(append_records),
            len(error_records),
            "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl, )
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL, CFG_POSHARVEST_EMAIL, subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL, ))
def main(args):
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        conference = identifier.split(':')[2]
        conference = conference.split('/')[0]
        contribution = identifier.split(':')[2]
        contribution = contribution.split('/')[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query,))
        results = perform_request_search(p=query, of="id")

        #harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False

        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('.pdf'):
                found = True
                if results:
                    rec = create_record()
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec, '856', ind1='4', subfields=[
                    ('u', url),
                    ('y', 'PoS server')
                ])
                record_add_field(rec, 'FFT', subfields=[('a', filename),
                                                        ('t', 'PoS'),
                                                        ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, '001', controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url,))
                break
        if not found:
            error_records.append(rec)

        #upload to FTP
        tempfile_path = '/tmp/%s.xml' % (contribution,)
        with open(tempfile_path, 'w') as tempfile:
            tempfile.write(record_xml_output(rec))
        try:
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            write_message("%s successfully uploaded to FTP server" % tempfile_path)
        except:
            write_message("Failed to upload %s to FTP server" % tempfile_path)
        remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename,)
    append_filename = "%s.append.xml" % (input_filename,)
    errors_filename = "%s.errors.xml" % (input_filename,)

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
           (total_records,
            len(insert_records),
            len(append_records),
            len(error_records),
            "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl,)
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL,
                      CFG_POSHARVEST_EMAIL,
                      subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL,))
Example #7
def create_collection(batch_size, new_files, new_sources,
                      directory, submit):
    """Create a single xml file "collection.xml"
    that contains all the records."""
    subject = "Consyn harvest results: %s" % \
              (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    batch = 1
    counter = 1
    date = datetime.now().strftime("%Y.%m.%d")
    files_to_upload = []
    collection = None
    for filename in new_files:
        if counter == 1:
            filepath = "elsevier-%s-%s.xml" % (date, batch)
            filepath = join(directory, filepath)
            filepath = filepath.lstrip()
            collection = open(filepath, 'w')
            collection.write("<collection>\n")
        with open(filename) as f:
            collection.write(f.read() + '\n')
            counter += 1
        if counter == batch_size:
            collection.write("</collection>")
            collection.close()
            files_to_upload.append(filepath)
            counter = 1
            batch += 1
    if counter < batch_size and collection:
        collection.write("</collection>")
        collection.close()
        files_to_upload.append(filepath)
    body = ['From %s sources, found and converted %s records' %
            (len(new_sources), len(new_files)),
            '\t%s records ready to upload:\n' %
            ((batch - 1) * batch_size + counter,)]
    if submit:
        body += ['\tFiles uploaded to Server:']
        for filepath in files_to_upload:
            try:
                submit_records_via_ftp(filepath)
                filename = filepath.split('/')[-1]
                body.append("\t%s (%s records)" % (filename, batch_size))
            except:
                _errors_detected.append(Exception(
                    "Failed to upload %s to FTP server" % filepath)
                )
                write_message("Failed to upload %s to FTP server" % filepath)
    else:
        body += ['\tFiles ready for upload:']
        for filename in files_to_upload:
            body.append("\t%s (%s records)" % (filename, batch_size))
    if len(body) > 3:
        # update the last line of the message
        body[-1] = "\t%s (%s records)" % (filename, counter)
        body = '\n'.join(body)
        write_message(subject)
        write_message(body)
    else:
        body = '\n'.join(body)
        write_message(subject)
        write_message("No new files!")
    if submit:
        if submit_records_via_mail(subject, body, CFG_CONSYNHARVEST_EMAIL):
            write_message("Mail sent to %r" % (CFG_CONSYNHARVEST_EMAIL,))
        else:
            write_message("ERROR: Cannot send mail.")
Example #8
    def submit_records(self, records_filename, mode, update=False,
                       taskid=0, silent=False):
        """
        Submit the given file (filepath) of records,
        either by e-mail or via BibUpload with the given mode.

        Taskid is given to indicate if the task submission should wait for any
        previously submitted tasks.

        The submission can also be made "silent" in the sense of not
        updating the modification date of the records.

        @param records_filename: filepath to XML file containing records.
        @type records_filename: string

        @param update: submit the records scheduled for update instead
            of the records scheduled for insertion.
        @type update: bool

        @param mode: which submission mode is it?
        @type mode: string

        @param taskid: bibsched taskid, wait for task to complete before submission
        @type taskid: int

        @param silent: do not update the modification date of the records
        @type silent: bool

        @return: returns the given taskid upon submission, or True/False from email.
        """
        if update:
            records_list = self.records_to_update
        else:
            records_list = self.records_to_insert

        # Check if we should create bibupload or e-mail
        if mode == "email":
            # Lets parse the records and find our IDs.
            list_of_dois = []
            for record in records_list:
                # We strip away the first part of the DOI for readability.
                list_of_dois.append('/'.join(record.doi.split('/')[1:]))
            # We send an e-mail to CFG_APSHARVEST_EMAIL and put file on AFS.
            body = "Harvested new records: %s" % (records_filename,)
            try:
                try:
                    shutil.move(records_filename, self.out_folder)
                    records_filename = os.path.join(self.out_folder,
                                                    os.path.basename(records_filename))
                    body = "Harvested new records on %s. They are located here:\n %s" % \
                           (self.date_started.strftime("%Y-%m-%d %H:%M:%S"), records_filename)
                except IOError, e:
                    # Some IOError
                    body = "Error while harvesting records: \nError saving %s - %s" % \
                           (records_filename, str(e))
                    raise e
            finally:
                submit_records_via_ftp(records_filename)
                body = "%s\nRecords harvested (%s total):\n%s\n" % (body,
                                                                    str(len(list_of_dois)),
                                                                    "\n".join(list_of_dois))

                body = "%s\nUploaded to FTP: %s" % (
                    body,
                    os.path.basename(records_filename)
                )

                res = submit_records_via_mail(self.mail_subject, body, CFG_APSHARVEST_EMAIL)
                write_message("Sent e-mail to %s with path to %s" %
                              (CFG_APSHARVEST_EMAIL, records_filename))
                return res
        else:
            # We submit a BibUpload task and wait for it to finish
            task_update_progress("Waiting for task to finish")

            if taskid != 0:
                write_message("Going to wait for %d to finish" % (taskid,))

            while not can_launch_bibupload(taskid):
                # Lets wait until the previously launched task exits.
                task_sleep_now_if_required(can_stop_too=False)
                time.sleep(5.0)

            taskid = submit_bibupload_for_records(mode, records_filename, silent)
            write_message("Submitted BibUpload task #%s with mode %s" %
                         (str(taskid), mode))
            return taskid
Example #9
def create_collection(batch_size, new_files, new_sources, directory,
                      upload_FTP):
    """Create a single xml file "collection.xml"
    that contains all the records."""
    subject = "Consyn harvest results: %s" % \
              (datetime.now().strftime("%Y-%m-%d %H:%M:%S"),)
    if new_files:
        batch = 1
        counter = 0
        date = datetime.now().strftime("%Y.%m.%d")
        filepath = "elsevier-%s-%s.xml" % (date, batch)
        filepath = join(directory, filepath)
        filepath = filepath.lstrip()
        files_to_upload = []
        with open(filepath, 'w') as collection:
            collection.write("<collection>\n")
            for f in new_files:
                if counter == batch_size:
                    counter = 0
                    batch += 1
                    collection.write("</collection>")
                    collection.close()
                    files_to_upload.append(filepath)
                    filepath = "elsevier-%s-%s.xml" % (date, batch)
                    filepath = join(directory, filepath).lstrip()
                    collection = open(filepath, 'w')
                    collection.write("<collection>\n")
                xmlFile = open(f, 'r')
                xmlString = xmlFile.read()
                xmlFile.close()
                collection.write(xmlString + '\n')
                counter += 1
            collection.write("</collection>")
            files_to_upload.append(filepath)
        body = [
            'From %s sources, found and converted %s records' %
            (len(new_sources), len(new_files)),
            '\t%s records ready to upload:\n' % ((batch - 1) * batch_size + counter, )
        ]
        if upload_FTP:
            body += ['\tFiles uploaded to Server:']
        else:
            body += ['\tFiles ready for upload:']
        for filepath in files_to_upload:
            try:
                submit_records_via_ftp(filepath)
                filename = filepath.split('/')[-1]
                body.append("\t%s (%s records)" % (filename, batch_size))
            except:
                write_message("Failed to upload %s to FTP server" % filepath)
        if len(body) > 3:
            # update the last line of the message
            body[-1] = "\t%s (%s records)" % (filename, counter)
        body = '\n'.join(body)

        write_message(subject)
        write_message(body)
        if submit_records_via_mail(subject, body, CFG_CONSYNHARVEST_EMAIL):
            write_message("Mail sent to %r" % (CFG_CONSYNHARVEST_EMAIL, ))
        else:
            write_message("ERROR: Cannot send mail.")
    else:
        write_message(subject)
        write_message("No new files")