def test_content_type(self):
        """ Test simple calls to download_url """
        tmpdoc = download_url("http://duckduckgo.com", content_type="html")
        self.assertTrue(tmpdoc)

        fun = lambda: download_url("http://google.com", content_type="pdf")
        self.assertRaises(InvenioFileDownloadError, fun)
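For orientation, here is a minimal sketch of the download_url call pattern that the snippets below rely on. The import path is the one used further down in these examples; the URL, target path and keyword values are placeholders, not taken from any real deployment.

from invenio.filedownloadutils import (download_url,
                                       InvenioFileDownloadError)

try:
    # Enforce the expected content type and download to a known location
    # (placeholder URL and path).
    path = download_url(url="http://example.org/paper.pdf",
                        content_type="pdf",
                        download_to_file="/tmp/paper.pdf",
                        retry_count=5,
                        timeout=60.0)
except InvenioFileDownloadError as err:
    # Raised when the download fails or the content type does not match.
    print("Download failed: %s" % err)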
def old_URL_harvest(from_date, to_date, to_dir, area):
    """
        Grab all the PDFs and tarballs off arXiv between from_date and to_date,
        where from_date and to_date are in YYMM form, and put them in their own
        separate folders inside of to_dir.  Folder hierarchy will be
            to_dir/YYYY/MM/arXiv_id/stuff_downloaded_from_arXiv
        this obeys the old URL format

        @param: from_date (int): YYMM form of the date where we want to start
            harvesting
        @param: to_date (int): YYMM form of the date where we want to stop
            harvesting
        @param: to_dir (string): the base directory to put all these subdirs in
        @param: area (list): the HEP_AREAS entry for the area we are
            currently working on downloading

        @output: PDFs and tarballs from arXiv in a hierarchy rooted at to_dir
        @return: None
    """

    yearmonthindex = from_date

    while yearmonthindex < to_date:

        sub_dir = make_useful_directories(yearmonthindex, to_dir)

        for paperindex in range(1, 1000):
            # for whatever reason, we can't count on these things to
            # start at 1 (in HEP_PH from 9403 to CENTURY_END only).
            # they start at frickin 202.
            #if area == HEP_PH and yearmonthindex < ARBITRARY_FROM_INDEX:
            #   paperindex = paperindex + 201
            # of note: before the URL change happened in 0704, it was
            # also the case that the paper numbers only had 3 digits
            next_to_harvest = '%04d%03d' % (yearmonthindex, paperindex)
            arXiv_id = area[AREA_STRING_INDEX] + next_to_harvest
            individual_dir = make_single_directory(sub_dir, arXiv_id)

            full_url = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER + \
                       area[URL] + next_to_harvest
            abs_path = os.path.join(individual_dir,
                                    area[AREA_STRING_INDEX] + next_to_harvest)
            if not download_url(url=full_url,
                                content_type='tar',
                                download_to_file=abs_path):
                break
            full_pdf_url = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER + \
                           area[URL] + next_to_harvest
            abs_path = os.path.join(
                individual_dir,
                area[AREA_STRING_INDEX] + next_to_harvest + PDF_EXTENSION)
            download_url(url=full_pdf_url,
                         content_type='pdf',
                         download_to_file=abs_path)
            time.sleep(CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT)
        if yearmonthindex % 100 == 12:
            # we reached the end of the year!
            yearmonthindex = yearmonthindex + FIX_FOR_YEAR_END
        yearmonthindex = yearmonthindex + 1
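A small illustration of the identifier format assembled above; the 'hep-ph/' prefix only stands in for whatever area[AREA_STRING_INDEX] actually holds and is an assumption made for this sketch.

# Pre-0704 numbering: a three-digit paper index appended to the YYMM value.
yearmonthindex, paperindex = 9403, 1
next_to_harvest = '%04d%03d' % (yearmonthindex, paperindex)  # '9403001'
arXiv_id = 'hep-ph/' + next_to_harvest                       # e.g. 'hep-ph/9403001'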
Example n. 4
def new_URL_harvest(from_date, from_index, to_dir):
    """
        Grab all the PDFs and tarballs off arXiv from from_date up to the
        current month, where from_date is in YYMM form, and put them in their own
        separate folders inside of to_dir.  Folder hierarchy will be
            to_dir/YYYY/MM/arXiv_id/stuff_downloaded_from_arXiv
        this obeys the new URL format

        @param: from_date (int): YYMM form of the date where we want to start
            harvesting
        @param: from_index (int): the paper index within the first month from
            which to start harvesting (later months start at index 1)
        @param: to_dir (string): the base directory to put all these subdirs in

        @output: PDFs and tarballs from arXiv in a hierarchy rooted at to_dir
        @return: None
    """

    global current_yearmonth
    yearmonthindex = from_date

    while yearmonthindex < current_yearmonth:

        if yearmonthindex == from_date:
            fro = from_index
        else:
            fro = 1

        sub_dir = make_useful_directories(yearmonthindex, to_dir)

        for paperindex in range(fro, 10000):

            # of note: after the URL change happened in 0704, it was
            # the case that paper numbers had 4 digits
            next_to_harvest = '%04d.%04d' % (yearmonthindex, paperindex)
            arXiv_id = ARXIV_HEADER + next_to_harvest
            individual_dir = make_single_directory(sub_dir, arXiv_id)

            full_url = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER + \
                       next_to_harvest
            abs_path = os.path.join(individual_dir, ARXIV_HEADER + next_to_harvest)
            if not download_url(url=full_url,
                                content_type='tar',
                                download_to_file=abs_path):
                break

            full_pdf_url = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER + \
                           next_to_harvest
            abs_path = os.path.join(individual_dir, ARXIV_HEADER + next_to_harvest + PDF_EXTENSION)
            download_url(url=full_pdf_url,
                         content_type='pdf',
                         download_to_file=abs_path)
            time.sleep(CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT) # be nice to remote server

        if yearmonthindex % 100 == 12:
            # we reached the end of the year!
            yearmonthindex = yearmonthindex + FIX_FOR_YEAR_END
        yearmonthindex = yearmonthindex + 1
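The corresponding post-0704 form built by this function looks as follows; ARXIV_HEADER is assumed here to be a plain 'arXiv:' prefix, purely for illustration.

# New-style numbering after the 0704 URL change: YYMM.NNNN with a four-digit index.
yearmonthindex, paperindex = 704, 1
next_to_harvest = '%04d.%04d' % (yearmonthindex, paperindex)  # '0704.0001'
arXiv_id = 'arXiv:' + next_to_harvest                         # assumed prefix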
Example n. 6
def download_feed(feed, batch_size, delete_zip, new_sources,
                  directory):
    """ Get list of entries from XML document """
    xmlString = ""
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed,
                                   retry_count=5,
                                   timeout=60.0)
        try:
            result_file = open(result_path, 'r')
            xmlString = result_file.read()
        finally:
            result_file.close()
            remove(result_path)
    except InvenioFileDownloadError as err:
        write_message("URL could not be opened: %s" % (feed,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return

    dom = xml.dom.minidom.parseString(xmlString)
    entries = dom.getElementsByTagName("entry")

    # Loop through entries
    for entry in entries:
        # Get URL and filename
        fileUrl = entry.getElementsByTagName("link")[0].getAttribute("href")
        fileName = entry.getElementsByTagName("title")[0].firstChild.data

        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        # Check if file has already been fetched
        existing_files = list(locate(fileName, root=CFG_CONSYN_OUT_DIRECTORY))

        if len(existing_files) == 1:
            write_message("Not downloading %s, already found %s in %s\n" %
                          (fileUrl, existing_files[0], outFilename))
        else:
            try:
                write_message("Downloading %s to %s\n" % (fileUrl, outFilename))
                download_url(fileUrl, "zip", outFilename, 5, 60.0)
                new_sources.append(outFilename)
            except InvenioFileDownloadError as err:
                write_message("URL could not be opened: %s" % (fileUrl,))
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
            try:
                extractAll(outFilename, delete_zip, directory)
            except BadZipfile:
                write_message("Error BadZipfile %s", (outFilename,))
                task_update_status("CERROR")
                remove(outFilename)
Example n. 7
def old_URL_harvest(from_date, to_date, to_dir, area):
    """
        Grab all the PDFs and tarballs off arXiv between from_date and to_date,
        where from_date and to_date are in YYMM form, and put them in their own
        separate folders inside of to_dir.  Folder hierarchy will be
            to_dir/YYYY/MM/arXiv_id/stuff_downloaded_from_arXiv
        this obeys the old URL format

        @param: from_date (int): YYMM form of the date where we want to start
            harvesting
        @param: to_date (int): YYMM form of the date where we want to stop
            harvesting
        @param: to_dir (string): the base directory to put all these subdirs in
        @param: area (list): the HEP_AREAS entry for the area we are
            currently working on downloading

        @output: PDFs and tarballs from arXiv in a hierarchy rooted at to_dir
        @return: None
    """

    yearmonthindex = from_date

    while yearmonthindex < to_date:

        sub_dir = make_useful_directories(yearmonthindex, to_dir)

        for paperindex in range(1, 1000):
            # for whatever reason, we can't count on these things to
            # start at 1 (in HEP_PH from 9403 to CENTURY_END only).
            # they start at frickin 202.
            #if area == HEP_PH and yearmonthindex < ARBITRARY_FROM_INDEX:
            #   paperindex = paperindex + 201
            # of note: before the URL change happened in 0704, it was
            # also the case that the paper numbers only had 3 digits
            next_to_harvest = '%04d%03d' % (yearmonthindex, paperindex)
            arXiv_id = area[AREA_STRING_INDEX] + next_to_harvest
            individual_dir = make_single_directory(sub_dir, arXiv_id)

            full_url = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER + \
                       area[URL] + next_to_harvest
            abs_path = os.path.join(individual_dir, area[AREA_STRING_INDEX] + next_to_harvest)
            if not download_url(url=full_url,
                                content_type='tar',
                                download_to_file=abs_path):
                break
            full_pdf_url = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER + \
                           area[URL] + next_to_harvest
            abs_path = os.path.join(individual_dir, area[AREA_STRING_INDEX] + next_to_harvest + PDF_EXTENSION)
            download_url(url=full_pdf_url,
                         content_type='pdf',
                         download_to_file=abs_path)
            time.sleep(CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT)
        if yearmonthindex % 100 == 12:
            # we reached the end of the year!
            yearmonthindex = yearmonthindex + FIX_FOR_YEAR_END
        yearmonthindex = yearmonthindex + 1
Example n. 8
def download_feed(feed, batch_size, delete_zip, new_sources, directory):
    """ Get list of entries from XML document """
    xmlString = ""
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed, retry_count=5, timeout=60.0)
        try:
            result_file = open(result_path, 'r')
            xmlString = result_file.read()
        finally:
            result_file.close()
            remove(result_path)
    except InvenioFileDownloadError as err:
        write_message("URL could not be opened: %s" % (feed, ))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return

    dom = parseString(xmlString)
    entries = dom.getElementsByTagName("entry")

    # Loop through entries
    for entry in entries:
        # Get URL and filename
        fileUrl = entry.getElementsByTagName("link")[0].getAttribute("href")
        fileName = entry.getElementsByTagName("title")[0].firstChild.data

        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        # Check if file has already been fetched
        existing_files = list(locate(fileName, root=CFG_CONSYN_OUT_DIRECTORY))

        if len(existing_files) == 1:
            write_message("Not downloading %s, already found %s in %s\n" %
                          (fileUrl, existing_files[0], outFilename))
        else:
            try:
                write_message("Downloading %s to %s\n" %
                              (fileUrl, outFilename))
                download_url(fileUrl, "zip", outFilename, 5, 60.0)
                new_sources.append(outFilename)
            except InvenioFileDownloadError as err:
                write_message("URL could not be opened: %s" % (fileUrl, ))
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
            try:
                extractAll(outFilename, delete_zip, directory)
            except BadZipfile:
                write_message("Error BadZipfile %s", (outFilename, ))
                task_update_status("CERROR")
                remove(outFilename)
Example n. 9
def download_feed(feed_url, batch_size, delete_zip, new_sources,
                  directory, feed_location):
    """ Get list of entries from XML document """
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed_url,
                                   content_type="xml",
                                   download_to_file=feed_location,
                                   retry_count=5,
                                   timeout=60.0)
    except InvenioFileDownloadError as err:
        _errors_detected.append(err)
        write_message("URL could not be opened: %s" % (feed_url,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
    xml_files = []
    entries = parse_feed(result_path)
    for fileUrl, fileName in entries:
        task_sleep_now_if_required()
        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        # Check if file has already been fetched
        existing_files = list(locate(fileName, root=CFG_CONSYN_OUT_DIRECTORY))

        if len(existing_files) == 1:
            write_message("Not downloading %s, already found %s in %s\n" %
                          (fileUrl, existing_files[0], outFilename))
        else:
            fileUrl = fileUrl.replace(' ', '%20')
            try:
                write_message("Downloading %s to %s\n" % (fileUrl,
                                                          outFilename))
                download_url(fileUrl, "zip", outFilename, 5, 60.0)
                new_sources.append(outFilename)
            except InvenioFileDownloadError as err:
                _errors_detected.append(err)
                write_message("URL could not be opened: %s" % fileUrl)
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
            try:
                xml_files.extend(extractAll(outFilename,
                                            delete_zip,
                                            directory))
            except BadZipfile as err:
                _errors_detected.append(err)
                write_message("Error BadZipfile %s" % (outFilename,))
                task_update_status("CERROR")
                remove(outFilename)
    return xml_files
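The parse_feed helper is not shown in this snippet; a plausible sketch, consistent with how the earlier download_feed versions pull link/title pairs out of the Atom entries, could look like this (hypothetical helper, not the actual implementation):

import xml.dom.minidom

def parse_feed(path):
    """Hypothetical sketch: return (url, filename) pairs from a downloaded feed."""
    with open(path, 'r') as feed_file:
        dom = xml.dom.minidom.parseString(feed_file.read())
    entries = []
    for entry in dom.getElementsByTagName("entry"):
        file_url = entry.getElementsByTagName("link")[0].getAttribute("href")
        file_name = entry.getElementsByTagName("title")[0].firstChild.data
        entries.append((file_url, file_name))
    return entries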
Example n. 10
def get_remote_record(recid):
    """ For a given remote record ID, we download the record XML and return
    the record in a BibRecord structure
    Parameter:
    (int) recid - record ID for remote record
    Returns: BibRecord
    """
    url = "%s/record/%d/export/xm?ot=001,035" % (REMOTE_URL, recid)
    tmp_file = ''
    try:
        bibrec = None
        tmp_file = download_url(url, retry_count=10, timeout=61.0)
        with open(tmp_file, 'r') as temp:
            content = temp.read()
            bibrec, code, errors = create_record(content)
            if code != 1 or errors:
                _print(
                    "Warning: There were errors creating BibRec structure " +
                    "from remote record #%d" % recid, 4)
        os.remove(tmp_file)
        return bibrec
    except (StandardError, InvenioFileDownloadError, HTTPError) as err:
        _print("Error: Could not download remote record #%d" % recid, 4)
        _print(str(err), 4)
        _print(traceback.format_exc(), 4)
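For reference, the export URL assembled above expands to something like the following; the REMOTE_URL value and the record ID are placeholders.

REMOTE_URL = "http://cds.cern.ch"  # placeholder base URL
recid = 12345                      # placeholder record ID
url = "%s/record/%d/export/xm?ot=001,035" % (REMOTE_URL, recid)
# -> 'http://cds.cern.ch/record/12345/export/xm?ot=001,035'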
Example n. 11
def bst_consyn_harvest(CONSYNATOMURL="https://consyn.elsevier.com/batch/atom?key=QUhvbHRrYW1wOzM0Mjc%253d"):
    """
    Task to download metadata given an ATOM feed from consyn.elsevier.com
    and a folder to store the files.

    @param CONSYNATOMURL: The URL of the atom feed to download.
    """
    if not os.path.exists(CFG_CONSYN_OUT_DIRECTORY):
        folders = CFG_CONSYN_OUT_DIRECTORY.split("/")
        folder = "/"
        for i in range(1, len(folders)):
            folder = os.path.join(folder, folders[i]).strip()
            if not os.path.exists(folder):
                os.mkdir(folder)
    try:
        run_sql("SELECT filename FROM CONSYNHARVEST")
    except:
        run_sql("CREATE TABLE CONSYNHARVEST ("
                "filename VARCHAR(100) NOT NULL PRIMARY KEY,"
                "date VARCHAR(50),"
                "size VARCHAR(30) );")
    # Get list of entries from XML document
    xmlString = ""
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_file = download_url(url=CONSYNATOMURL,
                                   retry_count=5,
                                   timeout=60.0)
        xmlString = open(result_file, 'r').read()
    except InvenioFileDownloadError as err:
        write_message("URL could not be opened: %s" % (CONSYNATOMURL,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
Example n. 12
    def _attach_fulltext(self, rec, doi):
        url = "http://dx.doi.org/" + doi
        page = requests.get(url)
        # url after redirect
        url = page.url
        page = page.text
        parsed_uri = urlparse(url)
        domain = "{uri.scheme}://{uri.netloc}".format(uri=parsed_uri)
        page = BeautifulSoup(page)
        try:
            if "epjconf" in doi:
                div = page.body.find("div", attrs={"id": "header"})
            else:
                div = page.body.find("div", attrs={"class": "module_background files"})
            links = div.findAll("a")
        except AttributeError:
            return
        for pdf in links:
            if pdf["href"].endswith("pdf"):
                link_to_pdf = domain + pdf["href"]
                record_add_field(rec, "856", ind1="4", subfields=[("u", link_to_pdf), ("y", "EDP Sciences server")])
                try:
                    from invenio.filedownloadutils import download_url, InvenioFileDownloadError
                    from invenio.config import CFG_EDPSCIENCE_OUT_FOLDER

                    try:

                        out_folder = join(CFG_EDPSCIENCE_OUT_FOLDER, "fulltexts")
                        try:
                            makedirs(out_folder)
                            filename = join(out_folder, link_to_pdf.split("/")[-1])
                        except (IOError, OSError):
                            # Problem creating folder
                            filename = None

                        filename = download_url(
                            link_to_pdf, content_type="pdf", download_to_file=filename, retry_count=5, timeout=60.0
                        )
                        record_add_field(
                            rec, "FFT", subfields=[("a", filename), ("t", "INSPIRE-PUBLIC"), ("d", "Fulltext")]
                        )
                    except InvenioFileDownloadError as e:
                        print(e)
                except ImportError:
                    pass
Example n. 13
def download_feed(feed, batch_size, delete_zip, new_sources,
                  directory):
    """ Get list of entries from XML document """
    xmlString = ""
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed,
                                   retry_count=5,
                                   timeout=60.0)
        try:
            result_file = open(result_path, 'r')
            xmlString = result_file.read()
        finally:
            result_file.close()
            remove(result_path)
    except InvenioFileDownloadError as err:
        write_message("URL could not be opened: %s" % (feed,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
Example n. 15
def parse_and_download(infile, sdir):
    """
    Read the information in the input file and download the corresponding
    tarballs from arXiv.

    @param: infile (string): the name of the file to parse
    @param: sdir (string): where to put the downloaded tarballs
    """

    tarfiles = []

    tardir = os.path.join(sdir, 'tarballs')
    if not os.path.isdir(tardir):
        try:
            os.makedirs(tardir)
        except:
            write_message(sys.exc_info()[0])
            write_message('files will be loose, not in ' + tardir)
            tardir = sdir

    infile = open(infile)
    for line in infile.readlines():
        line = line.strip()
        if line.startswith('http://'):
            # hurray!
            url = line
            filename = url.split('/')[-1]
            abs_path = os.path.join(tardir, filename)
            if not download_url(url=url,
                                content_type='tar',
                                download_to_file=abs_path):
                write_message(filename + ' may already exist')
                write_message(sys.exc_info()[0])
            filename = os.path.join(tardir, filename)
            tarfiles.append(filename)
            time.sleep(CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT) # be nice!
        elif line.startswith('arXiv'):
            tarfiles.extend(tarballs_by_arXiv_id([line.strip()], sdir))

    return tarfiles
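The input file read by parse_and_download is expected to hold one source per line, either a direct tarball URL or an arXiv-style identifier; a hypothetical example of its contents:

# Hypothetical input lines for parse_and_download (placeholder values).
sample_lines = [
    "http://export.arxiv.org/e-print/0704.0001",  # direct tarball URL
    "arXiv:0704.0002",                            # routed to tarballs_by_arXiv_id
]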
Example n. 17
def get_remote_record(recid):
    """ For a given remote record ID, we download the record XML and return
    the record in a BibRecord structure
    Parameter:
    (int) recid - record ID for remote record
    Returns: BibRecord
    """
    url = "%s/record/%d/export/xm?ot=001,035" % (REMOTE_URL, recid)
    tmp_file = ''
    try:
        bibrec = None
        tmp_file = download_url(url, retry_count=10, timeout=61.0)
        with open(tmp_file, 'r') as temp:
            bibrec, code, errors = create_record(temp.read())
            if code != 1 or errors:
                _print("Warning: There were errors creating BibRec structure " +
                       "from remote record #%d" % recid, 4)
        os.remove(tmp_file)
        return bibrec
    except (StandardError, InvenioFileDownloadError, HTTPError) as err:
        _print("Error: Could not download remote record #%d" % recid, 4)
        _print(err.message, 5)
Example n. 18
def apply_filter(rec):
    """ Filters the record to be compatible within Inspire
    Parameters:
     * rec - dictionary: BibRecord structure
    Returns: dictionary, BibRecord structure
    """
    # Move recid from 001 to 035 if not hidden
    cds_id = rec['001'][0][3]
    if not 'hidden' in [
            x.lower() for x in record_get_field_values(rec, "980", code="a")
    ]:
        record_add_field(rec, '035', subfields=[('9', 'CDS'), ('a', cds_id)])
    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = [
        "024", "041", "035", "037", "088", "100", "110", "111", "242", "245",
        "246", "260", "269", "300", "502", "650", "653", "693", "700", "710",
        "773", "856", "520", "500", "980"
    ]
    for tag in rec.keys():
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set([])
    for value in record_get_field_values(rec, '980', code='a'):
        if 'NOTE' in value.upper():
            collections.add('NOTE')
        if 'THESIS' in value.upper():
            collections.add('THESIS')
        if 'CONFERENCEPAPER' in value.upper():
            collections.add('ConferencePaper')

    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    if not 'NOTE' in collections:
        # TODO: Move this to a KB
        kb = [
            'ATLAS-CONF-', 'CMS-PAS-', 'ATL-', 'CMS-DP-', 'ALICE-INT-',
            'LHCb-PUB-'
        ]
        values = record_get_field_values(rec, "088", code='a')
        for val, rep in product(values, kb):
            if val.startswith(rep):
                collections.add('NOTE')
                break

    # 980 Arxiv tag
    if record_get_field_values(rec,
                               '035',
                               filter_subfield_code="a",
                               filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add('HEP')
    collections.add('CORE')

    # 980 Conference Note
    if not 'ConferencePaper' in collections:
        for value in record_get_field_values(rec, '962', code='n'):
            if value[-2:].isdigit():
                collections.add('ConferencePaper')
                break

    record_delete_fields(rec, "980")

    intnote = record_get_field_values(rec,
                                      '690',
                                      filter_subfield_code="a",
                                      filter_subfield_value='INTNOTE')
    if intnote:
        val_088 = record_get_field_values(rec, '088', filter_subfield_code="a")
        for val in val_088:
            if 'CMS' in val:
                url = ('http://weblib.cern.ch/abstract?CERN-CMS' +
                       val.split('CMS', 1)[-1])
                record_add_field(rec, '856', ind1='4', subfields=[('u', url)])

    # 041 Language
    languages = get_languages()
    language_fields = record_get_field_instances(rec, '041')
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            if "eng" in subs['a']:
                continue
            new_value = translate_config(subs['a'][0], languages)
            new_subs = [('a', new_value)]
            record_add_field(rec, "041", subfields=new_subs)

    # 035 Externals
    scn_035_fields = record_get_field_instances(rec, '035')
    forbidden_values = [
        "cercer", "inspire", "xx", "cern annual report", "cmscms", "wai01"
    ]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            if not 'a' in subs:
                continue
            for sub in subs['9']:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs['9']]
                if 'spires' in suffixes:
                    new_subs = [('a', 'SPIRES-%s' % subs['a'][0])]
                    record_add_field(rec, '970', subfields=new_subs)
                    continue
        if 'a' in subs:
            for sub in subs['a']:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec,
                                        tag="035",
                                        field_position_global=field[4])

    rep_088_fields = record_get_field_instances(rec, '088')
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            for val in subs['9']:
                if val.startswith('P0') or val.startswith('CM-P0'):
                    sf = [('9', 'CERN'), ('b', val)]
                    record_add_field(rec, '595', subfields=sf)
        for key, val in field[0]:
            if key in ['a', '9'] and not val.startswith('SIS-'):
                record_add_field(rec, '037', subfields=[('a', val)])
    record_delete_fields(rec, "088")

    # 037 Externals also...
    rep_037_fields = record_get_field_instances(rec, '037')
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            for value in subs['a']:
                if 'arXiv' in value:
                    new_subs = [('a', value), ('9', 'arXiv')]
                    for fld in record_get_field_instances(rec, '695'):
                        for key, val in field_get_subfield_instances(fld):
                            if key == 'a':
                                new_subs.append(('c', val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, '037', nf, field[4])
        for key, val in field[0]:
            if key in ['a', '9'] and val.startswith('SIS-'):
                record_delete_field(rec, '037', field_position_global=field[4])

    for field in record_get_field_instances(rec, '242'):
        record_add_field(rec, '246', subfields=field[0])
    record_delete_fields(rec, '242')

    # 269 Date normalization
    for field in record_get_field_instances(rec, '269'):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                record_delete_fields(rec, "260")

    if not 'THESIS' in collections:
        for field in record_get_field_instances(rec, '260'):
            record_add_field(rec, '269', subfields=field[0])
        record_delete_fields(rec, '260')

    # 300 page number
    for field in record_get_field_instances(rec, '300'):
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                if "mult." not in value and value != " p":
                    field[0][idx] = ('a', re.sub(r'[^\d-]+', '', value))
                else:
                    record_delete_field(rec,
                                        '300',
                                        field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, '100')
    author_names.extend(record_get_field_instances(rec, '700'))
    for field in author_names:
        subs = field_get_subfields(field)
        if not 'i' in subs or 'XX' in subs['i']:
            if not 'j' in subs or 'YY' in subs['j']:
                for idx, (key, value) in enumerate(field[0]):
                    if key == 'a':
                        field[0][idx] = ('a', punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if 'THESIS' in collections:
        for field in record_get_field_instances(rec, '700'):
            record_add_field(rec, '701', subfields=field[0])
        record_delete_fields(rec, '700')

    # 501 move subfields
    fields_501 = record_get_field_instances(rec, '502')
    for idx, field in enumerate(fields_501):
        new_subs = []
        for key, value in field[0]:
            if key == 'a':
                new_subs.append(('b', value))
            elif key == 'b':
                new_subs.append(('c', value))
            elif key == 'c':
                new_subs.append(('d', value))
            else:
                new_subs.append((key, value))
        fields_501[idx] = field_swap_subfields(field, new_subs)

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec,
                                                 '650',
                                                 ind1='1',
                                                 ind2='7')
    record_delete_fields(rec, "650")
    for field in category_fields:
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [('2', 'INSPIRE'), ('a', new_value)]
                else:
                    new_subs = [('2', 'SzGeCERN'), ('a', value)]
                record_add_field(rec,
                                 "650",
                                 ind1="1",
                                 ind2="7",
                                 subfields=new_subs)
                break

    # 653 Free Keywords
    for field in record_get_field_instances(rec, '653', ind1='1'):
        subs = field_get_subfields(field)
        new_subs = []
        if 'a' in subs:
            for val in subs['a']:
                new_subs.extend([('9', 'author'), ('a', val)])
        new_field = create_field(subfields=new_subs, ind1='1')
        record_replace_field(rec,
                             '653',
                             new_field,
                             field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, '693'):
        subs = field_get_subfields(field)
        all_subs = subs.get('a', []) + subs.get('e', [])
        if 'not applicable' in [x.lower() for x in all_subs]:
            record_delete_field(rec, '693', field_position_global=field[4])
        new_subs = []
        experiment_a = ""
        experiment_e = ""
        for (key, value) in subs.iteritems():
            if key == 'a':
                experiment_a = value[0]
                new_subs.append((key, value[0]))
            elif key == 'e':
                experiment_e = value[0]
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"), experiment_e)
        translated_experiments = translate_config(experiment, experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693", field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration
    for field in record_get_field_instances(rec, '710'):
        subs = field_get_subfield_instances(field)
        for idx, (key, value) in enumerate(subs[:]):
            if key == '5':
                subs.pop(idx)
            elif value.startswith('CERN. Geneva'):
                subs.pop(idx)
        if len(subs) == 0:
            record_delete_field(rec, '710', field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, '773'):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for idx, (key, value) in enumerate(subs):
            if key == 'p':
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773", field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, '856', ind1='4'):
        subs = field_get_subfields(field)

        newsubs = []
        remove = False

        if 'z' in subs:
            is_figure = [s for s in subs['z'] if "figure" in s.lower()]
            if is_figure and 'u' in subs:
                is_subformat = [
                    s for s in subs['u'] if "subformat" in s.lower()
                ]
                if not is_subformat:
                    url = subs['u'][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(suffix=os.path.basename(url),
                                                dir=CFG_TMPSHAREDDIR)
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url),
                               verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_url(url=url,
                                                    download_to_file=local_url,
                                                    timeout=30.0)
                        except InvenioFileDownloadError:
                            _print(
                                "Download failed while attempting to reach %s. Skipping.."
                                % (url, ))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s" %
                                       (local_url, url),
                                       verbose=5)
                            else:
                                _print("Conversion failed on %s" %
                                       (local_url, ))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(('a', url))
                        newsubs.append(('t', 'Plot'))
                        figure_counter += 1
                        if 'y' in subs:
                            newsubs.append(
                                ('d',
                                 "%05d %s" % (figure_counter, subs['y'][0])))
                            newsubs.append(('n', subs['y'][0]))
                        else:
                            # Get basename without extension.
                            name = os.path.basename(
                                os.path.splitext(subs['u'][0])[0])
                            newsubs.append(
                                ('d', "%05d %s" % (figure_counter, name)))
                            newsubs.append(('n', name))

        if not newsubs and 'u' in subs:
            is_fulltext = [s for s in subs['u'] if ".pdf" in s]
            if is_fulltext:
                newsubs = [('t', 'INSPIRE-PUBLIC'), ('a', subs['u'][0])]

        if not newsubs and 'u' in subs:
            remove = True
            is_zipfile = [s for s in subs['u'] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR,
                                         os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url), verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_url(url=is_zipfile[0],
                                                  download_to_file=local_url,
                                                  timeout=30.0)
                except InvenioFileDownloadError:
                    _print(
                        "Download failed while attempting to reach %s. Skipping.."
                        % (is_zipfile[0], ))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        plotsubs = []
                        plotsubs.append(('a', png))
                        caption = '%05d %s' % (figure_counter,
                                               os.path.basename(png))
                        plotsubs.append(('d', caption))
                        plotsubs.append(('t', 'Plot'))
                        record_add_field(rec, 'FFT', subfields=plotsubs)

        if not remove and not newsubs and 'u' in subs:
            urls = ('http://cdsweb.cern.ch', 'http://cms.cern.ch',
                    'http://cmsdoc.cern.ch', 'http://documents.cern.ch',
                    'http://preprints.cern.ch', 'http://cds.cern.ch')
            for val in subs['u']:
                if any(url in val for url in urls):
                    remove = True
                    break
                if val.endswith('ps.gz'):
                    remove = True

        if newsubs:
            record_add_field(rec, 'FFT', subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec,
                                '856',
                                ind1='4',
                                field_position_global=field[4])

    # 500 - Preliminary results
    if "THESIS" not in collections:
        subs = [('a', "Preliminary results")]
        record_add_field(rec, "500", subfields=subs)

    for collection in collections:
        record_add_field(rec, '980', subfields=[('a', collection)])

    return rec
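A minimal, hypothetical driver combining the snippets above; it assumes get_remote_record and apply_filter live in the same module, and the record ID is a placeholder.

# Hypothetical usage: fetch a remote record, then filter it for INSPIRE compatibility.
rec = get_remote_record(12345)  # placeholder recid
if rec is not None:
    rec = apply_filter(rec)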
Example n. 19
def apply_filter(rec):
    """ Filters the record to be compatible within Inspire
    Parameters:
     * rec - dictionary: BibRecord structure
    Returns: dictionary, BibRecord structure
    """
    # Move recid from 001 to 035 if not hidden
    cds_id = rec['001'][0][3]
    if not 'hidden' in [x.lower() for x in record_get_field_values(rec, "980",
                                                                   code="a")]:
        record_add_field(rec, '035', subfields=[('9', 'CDS'), ('a', cds_id)])
    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = ["024", "041", "035", "037", "088", "100",
                          "110", "111", "242", "245", "246", "260",
                          "269", "300", "502", "650", "653", "693",
                          "700", "710", "773", "856", "520", "500",
                          "980"]
    for tag in rec.keys():
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set([])
    for value in record_get_field_values(rec, '980', code='a'):
        if 'NOTE' in value.upper():
            collections.add('NOTE')
        if 'THESIS' in value.upper():
            collections.add('THESIS')
        if 'CONFERENCEPAPER' in value.upper():
            collections.add('ConferencePaper')


    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    if not 'NOTE' in collections:
        # TODO: Move this to a KB
        kb = ['ATLAS-CONF-', 'CMS-PAS-', 'ATL-', 'CMS-DP-',
              'ALICE-INT-', 'LHCb-PUB-']
        values = record_get_field_values(rec, "088", code='a')
        for val, rep in product(values, kb):
            if val.startswith(rep):
                collections.add('NOTE')
                break

    # 980 Arxiv tag
    if record_get_field_values(rec, '035', filter_subfield_code="a",
                               filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add('HEP')
    collections.add('CORE')

    # 980 Conference Note
    if not 'ConferencePaper' in collections:
        for value in record_get_field_values(rec, '962', code='n'):
            if value[-2:].isdigit():
                collections.add('ConferencePaper')
                break

    record_delete_fields(rec, "980")

    intnote = record_get_field_values(rec, '690', filter_subfield_code="a",
                                      filter_subfield_value='INTNOTE')
    if intnote:
        val_088 = record_get_field_values(rec, '088', filter_subfield_code="a")
        for val in val_088:
            if 'CMS' in val:
                url = ('http://weblib.cern.ch/abstract?CERN-CMS' +
                       val.split('CMS', 1)[-1])
                record_add_field(rec, '856', ind1='4', subfields=[('u', url)])

    # 041 Language
    languages = get_languages()
    language_fields = record_get_field_instances(rec, '041')
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            if "eng" in subs['a']:
                continue
            new_value = translate_config(subs['a'][0], languages)
            new_subs = [('a', new_value)]
            record_add_field(rec, "041", subfields=new_subs)

    # 035 Externals
    scn_035_fields = record_get_field_instances(rec, '035')
    forbidden_values = ["cercer",
                        "inspire",
                        "xx",
                        "cern annual report",
                        "cmscms",
                        "wai01"]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            if not 'a' in subs:
                continue
            for sub in subs['9']:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs['9']]
                if 'spires' in suffixes:
                    new_subs = [('a', 'SPIRES-%s' % subs['a'][0])]
                    record_add_field(rec, '970', subfields=new_subs)
                    continue
        if 'a' in subs:
            for sub in subs['a']:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec, tag="035",
                                        field_position_global=field[4])

    rep_088_fields = record_get_field_instances(rec, '088')
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            for val in subs['9']:
                if val.startswith('P0') or val.startswith('CM-P0'):
                    sf = [('9', 'CERN'), ('b', val)]
                    record_add_field(rec, '595', subfields=sf)
        for key, val in field[0]:
            if key in ['a', '9'] and not val.startswith('SIS-'):
                record_add_field(rec, '037', subfields=[('a', val)])
    record_delete_fields(rec, "088")

    # 037 Externals also...
    rep_037_fields = record_get_field_instances(rec, '037')
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            for value in subs['a']:
                if 'arXiv' in value:
                    new_subs = [('a', value), ('9', 'arXiv')]
                    for fld in record_get_field_instances(rec,  '695'):
                        for key, val in field_get_subfield_instances(fld):
                            if key == 'a':
                                new_subs.append(('c', val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, '037', nf, field[4])
        for key, val in field[0]:
            if key in ['a', '9'] and val.startswith('SIS-'):
                record_delete_field(rec, '037', field_position_global=field[4])

    for field in record_get_field_instances(rec, '242'):
        record_add_field(rec, '246', subfields=field[0])
    record_delete_fields(rec, '242')

    # 269 Date normalization
    for field in record_get_field_instances(rec, '269'):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                record_delete_fields(rec, "260")

    if not 'THESIS' in collections:
        for field in record_get_field_instances(rec, '260'):
            record_add_field(rec, '269', subfields=field[0])
        record_delete_fields(rec, '260')

    # 300 page number
    for field in record_get_field_instances(rec, '300'):
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                if "mult." not in value and value != " p":
                    field[0][idx] = ('a', re.sub(r'[^\d-]+', '', value))
                else:
                    record_delete_field(rec, '300',
                                        field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, '100')
    author_names.extend(record_get_field_instances(rec, '700'))
    for field in author_names:
        subs = field_get_subfields(field)
        if not 'i' in subs or 'XX' in subs['i']:
            if not 'j' in subs or 'YY' in subs['j']:
                for idx, (key, value) in enumerate(field[0]):
                    if key == 'a':
                        field[0][idx] = ('a', punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if 'THESIS' in collections:
        for field in record_get_field_instances(rec, '700'):
            record_add_field(rec, '701', subfields=field[0])
        record_delete_fields(rec, '700')

    # 501 move subfields
    fields_501 = record_get_field_instances(rec, '502')
    for idx, field in enumerate(fields_501):
        new_subs = []
        for key, value in field[0]:
            if key == 'a':
                new_subs.append(('b', value))
            elif key == 'b':
                new_subs.append(('c', value))
            elif key == 'c':
                new_subs.append(('d', value))
            else:
                new_subs.append((key, value))
        fields_501[idx] = field_swap_subfields(field, new_subs)

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec, '650', ind1='1', ind2='7')
    record_delete_fields(rec, "650")
    for field in category_fields:
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [('2', 'INSPIRE'), ('a', new_value)]
                else:
                    new_subs = [('2', 'SzGeCERN'), ('a', value)]
                record_add_field(rec, "650", ind1="1", ind2="7",
                                 subfields=new_subs)
                break

    # 653 Free Keywords
    for field in record_get_field_instances(rec, '653', ind1='1'):
        subs = field_get_subfields(field)
        new_subs = []
        if 'a' in subs:
            for val in subs['a']:
                new_subs.extend([('9', 'author'), ('a', val)])
        new_field = create_field(subfields=new_subs, ind1='1')
        record_replace_field(rec, '653', new_field, field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, '693'):
        subs = field_get_subfields(field)
        all_subs = subs.get('a', []) + subs.get('e', [])
        if 'not applicable' in [x.lower() for x in all_subs]:
            record_delete_field(rec, '693',
                                field_position_global=field[4])
        new_subs = []
        experiment_a = ""
        experiment_e = ""
        for (key, value) in subs.iteritems():
            if key == 'a':
                experiment_a = value[0]
                new_subs.append((key, value[0]))
            elif key == 'e':
                experiment_e = value[0]
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"),
                                  experiment_e)
        translated_experiments = translate_config(experiment,
                                                  experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693",
                            field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration
    for field in record_get_field_instances(rec, '710'):
        subs = field_get_subfield_instances(field)
        for idx, (key, value) in enumerate(subs[:]):
            if key == '5':
                subs.pop(idx)
            elif value.startswith('CERN. Geneva'):
                subs.pop(idx)
        if len(subs) == 0:
            record_delete_field(rec, '710', field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, '773'):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for idx, (key, value) in enumerate(subs):
            if key == 'p':
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773",
                            field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, '856', ind1='4'):
        subs = field_get_subfields(field)

        newsubs = []
        remove = False

        if 'z' in subs:
            is_figure = [s for s in subs['z'] if "figure" in s.lower()]
            if is_figure and 'u' in subs:
                is_subformat = [s for s in subs['u'] if "subformat" in s.lower()]
                if not is_subformat:
                    url = subs['u'][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(suffix=os.path.basename(url), dir=CFG_TMPSHAREDDIR)
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url), verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_url(url=url,
                                                    download_to_file=local_url,
                                                    timeout=30.0)
                        except InvenioFileDownloadError:
                            _print("Download failed while attempting to reach %s. Skipping.." % (url,))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s" % (local_url, url), verbose=5)
                            else:
                                _print("Conversion failed on %s" % (local_url,))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(('a', url))
                        newsubs.append(('t', 'Plot'))
                        figure_counter += 1
                        if 'y' in subs:
                            newsubs.append(('d', "%05d %s" % (figure_counter, subs['y'][0])))
                            newsubs.append(('n', subs['y'][0]))
                        else:
                            # Get basename without extension.
                            name = os.path.basename(os.path.splitext(subs['u'][0])[0])
                            newsubs.append(('d', "%05d %s" % (figure_counter, name)))
                            newsubs.append(('n', name))

        if not newsubs and 'u' in subs:
            is_fulltext = [s for s in subs['u'] if ".pdf" in s and not "subformat=pdfa" in s]
            if is_fulltext:
                newsubs = [('t', 'INSPIRE-PUBLIC'), ('a', subs['u'][0])]

        if not newsubs and 'u' in subs:
            remove = True
            is_zipfile = [s for s in subs['u'] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR, os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url), verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_url(url=is_zipfile[0],
                                                  download_to_file=local_url,
                                                  timeout=30.0)
                except InvenioFileDownloadError:
                    _print("Download failed while attempting to reach %s. Skipping.."
                           % (is_zipfile[0],))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        plotsubs = []
                        plotsubs.append(('a', png))
                        caption = '%05d %s' % (figure_counter, os.path.basename(png))
                        plotsubs.append(('d', caption))
                        plotsubs.append(('t', 'Plot'))
                        record_add_field(rec, 'FFT', subfields=plotsubs)

        if not remove and not newsubs and 'u' in subs:
            urls = ('http://cdsweb.cern.ch', 'http://cms.cern.ch',
                    'http://cmsdoc.cern.ch', 'http://documents.cern.ch',
                    'http://preprints.cern.ch', 'http://cds.cern.ch')
            for val in subs['u']:
                if any(url in val for url in urls):
                    remove = True
                    break
                if val.endswith('ps.gz'):
                    remove = True

        if newsubs:
            record_add_field(rec, 'FFT', subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec, '856', ind1='4',
                                field_position_global=field[4])

    # 500 - Preliminary results
    if "THESIS" not in collections:
        subs = [('a', "Preliminary results")]
        record_add_field(rec, "500", subfields=subs)

    for collection in collections:
        record_add_field(rec, '980', subfields=[('a', collection)])

    return rec
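
# Editor's note -- a minimal sketch of the field-instance layout that the
# filter above relies on, assuming the usual bibrecord structure where a field
# instance is (subfields, ind1, ind2, controlfield_value, global_position);
# the function name is illustrative only.
def _example_field_layout(rec):
    # Take the first 700 field, if any, and show the two slots the filter uses.
    for field in rec.get('700', []):
        subfield_pairs = field[0]   # list of (code, value) tuples, e.g. ('a', 'Doe, J.')
        global_position = field[4]  # value passed as field_position_global=...
        return subfield_pairs, global_position
    return [], None
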
def main(args):
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        conference = identifier.split(':')[2]
        conference = conference.split('/')[0]
        contribution = identifier.split(':')[2]
        contribution = contribution.split('/')[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query,))
        results = perform_request_search(p=query, of="id")

        #harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False

        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('.pdf'):
                found = True
                if results:
                    rec = create_record()
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec, '856', ind1='4', subfields=[
                    ('u', url),
                    ('y', 'PoS server')
                ])
                record_add_field(rec, 'FFT', subfields=[('a', filename),
                                                        ('t', 'PoS'),
                                                        ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, '001', controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url,))
                break
        if not found:
            error_records.append(rec)

        #upload to FTP
        tempfile_path = '/tmp/%s.xml' % (contribution,)
        with open(tempfile_path, 'w') as tempfile:
            tempfile.write(record_xml_output(rec))
        try:
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            write_message("%s successfully uploaded to FTP server" % tempfile_path)
        except:
            write_message("Failed to upload %s to FTP server" % tempfile_path)
        remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename,)
    append_filename = "%s.append.xml" % (input_filename,)
    errors_filename = "%s.errors.xml" % (input_filename,)

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
           (total_records,
            len(insert_records),
            len(append_records),
            len(error_records),
            "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl,)
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL,
                      CFG_POSHARVEST_EMAIL,
                      subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL,))
def apply_filter(rec):
    """ Filters the record to be compatible within Inspire
    Parameters:
     * rec - dictionary: BibRecord structure
    Returns: dictionary, BibRecord structure
    """
    # Move recid from 001 to 035 if not hidden
    cds_id = rec["001"][0][3]
    if not "hidden" in [x.lower() for x in record_get_field_values(rec, "980", code="a")]:
        record_add_field(rec, "035", subfields=[("9", "CDS"), ("a", cds_id)])
    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = [
        "024",
        "041",
        "035",
        "037",
        "088",
        "100",
        "110",
        "111",
        "242",
        "245",
        "246",
        "260",
        "269",
        "300",
        "502",
        "650",
        "653",
        "693",
        "700",
        "710",
        "773",
        "856",
        "520",
        "500",
        "980",
    ]
    for tag in rec.keys():
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set([])
    for value in record_get_field_values(rec, "980", code="a"):
        if "NOTE" in value.upper():
            collections.add("NOTE")
        if "THESIS" in value.upper():
            collections.add("THESIS")
        if "CONFERENCEPAPER" in value.upper():
            collections.add("ConferencePaper")

    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    if not "NOTE" in collections:
        # TODO: Move this to a KB
        kb = ["ATLAS-CONF-", "CMS-PAS-", "ATL-", "CMS-DP-", "ALICE-INT-", "LHCb-PUB-"]
        values = record_get_field_values(rec, "088", code="a")
        for val, rep in product(values, kb):
            if val.startswith(rep):
                collections.add("NOTE")
                break

    # 980 Arxiv tag
    if record_get_field_values(rec, "035", filter_subfield_code="a", filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add("HEP")
    collections.add("CORE")

    # 980 Conference Note
    if not "ConferencePaper" in collections:
        for value in record_get_field_values(rec, "962", code="n"):
            if value[-2:].isdigit():
                collections.add("ConferencePaper")
                break

    record_delete_fields(rec, "980")

    intnote = record_get_field_values(rec, "690", filter_subfield_code="a", filter_subfield_value="INTNOTE")
    if intnote:
        val_088 = record_get_field_values(rec, "088", filter_subfield_code="a")
        for val in val_088:
            if "CMS" in val:
                url = "http://weblib.cern.ch/abstract?CERN-CMS" + val.split("CMS", 1)[-1]
                record_add_field(rec, "856", ind1="4", subfields=[("u", url)])

    # 041 Language
    languages = get_languages()
    language_fields = record_get_field_instances(rec, "041")
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if "a" in subs:
            if "eng" in subs["a"]:
                continue
            new_value = translate_config(subs["a"][0], languages)
            new_subs = [("a", new_value)]
            record_add_field(rec, "041", subfields=new_subs)

    # 035 Externals
    scn_035_fields = record_get_field_instances(rec, "035")
    forbidden_values = ["cercer", "inspire", "xx", "cern annual report", "cmscms", "wai01"]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if "9" in subs:
            if not "a" in subs:
                continue
            for sub in subs["9"]:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs["9"]]
                if "spires" in suffixes:
                    new_subs = [("a", "SPIRES-%s" % subs["a"][0])]
                    record_add_field(rec, "970", subfields=new_subs)
                    continue
        if "a" in subs:
            for sub in subs["a"]:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec, tag="035", field_position_global=field[4])

    rep_088_fields = record_get_field_instances(rec, "088")
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if "9" in subs:
            for val in subs["9"]:
                if val.startswith("P0") or val.startswith("CM-P0"):
                    sf = [("9", "CERN"), ("b", val)]
                    record_add_field(rec, "595", subfields=sf)
        for key, val in field[0]:
            if key in ["a", "9"] and not val.startswith("SIS-"):
                record_add_field(rec, "037", subfields=[("a", val)])
    record_delete_fields(rec, "088")

    # 037 Externals also...
    rep_037_fields = record_get_field_instances(rec, "037")
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if "a" in subs:
            for value in subs["a"]:
                if "arXiv" in value:
                    new_subs = [("a", value), ("9", "arXiv")]
                    for fld in record_get_field_instances(rec, "695"):
                        for key, val in field_get_subfield_instances(fld):
                            if key == "a":
                                new_subs.append(("c", val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, "037", nf, field[4])
        for key, val in field[0]:
            if key in ["a", "9"] and val.startswith("SIS-"):
                record_delete_field(rec, "037", field_position_global=field[4])

    for field in record_get_field_instances(rec, "242"):
        record_add_field(rec, "246", subfields=field[0])
    record_delete_fields(rec, "242")

    # 269 Date normalization
    for field in record_get_field_instances(rec, "269"):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                record_delete_fields(rec, "260")

    if not "THESIS" in collections:
        for field in record_get_field_instances(rec, "260"):
            record_add_field(rec, "269", subfields=field[0])
        record_delete_fields(rec, "260")

    # 300 page number
    for field in record_get_field_instances(rec, "300"):
        for idx, (key, value) in enumerate(field[0]):
            if key == "a":
                if "mult." not in value and value != " p":
                    field[0][idx] = ("a", re.sub(r"[^\d-]+", "", value))
                else:
                    record_delete_field(rec, "300", field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, "100")
    author_names.extend(record_get_field_instances(rec, "700"))
    for field in author_names:
        subs = field_get_subfields(field)
        if not "i" in subs or "XX" in subs["i"]:
            if not "j" in subs or "YY" in subs["j"]:
                for idx, (key, value) in enumerate(field[0]):
                    if key == "a":
                        field[0][idx] = ("a", punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if "THESIS" in collections:
        for field in record_get_field_instances(rec, "700"):
            record_add_field(rec, "701", subfields=field[0])
        record_delete_fields(rec, "700")

    # 501 move subfields
    fields_501 = record_get_field_instances(rec, "502")
    for idx, field in enumerate(fields_501):
        new_subs = []
        for key, value in field[0]:
            if key == "a":
                new_subs.append(("b", value))
            elif key == "b":
                new_subs.append(("c", value))
            elif key == "c":
                new_subs.append(("d", value))
            else:
                new_subs.append((key, value))
        fields_501[idx] = field_swap_subfields(field, new_subs)

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec, "650", ind1="1", ind2="7")
    record_delete_fields(rec, "650")
    for field in category_fields:
        for idx, (key, value) in enumerate(field[0]):
            if key == "a":
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [("2", "INSPIRE"), ("a", new_value)]
                else:
                    new_subs = [("2", "SzGeCERN"), ("a", value)]
                record_add_field(rec, "650", ind1="1", ind2="7", subfields=new_subs)
                break

    # 653 Free Keywords
    for field in record_get_field_instances(rec, "653", ind1="1"):
        subs = field_get_subfields(field)
        new_subs = []
        if "a" in subs:
            for val in subs["a"]:
                new_subs.extend([("9", "author"), ("a", val)])
        new_field = create_field(subfields=new_subs, ind1="1")
        record_replace_field(rec, "653", new_field, field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, "693"):
        subs = field_get_subfields(field)
        all_subs = subs.get("a", []) + subs.get("e", [])
        if "not applicable" in [x.lower() for x in all_subs]:
            record_delete_field(rec, "693", field_position_global=field[4])
        new_subs = []
        experiment_a = ""
        experiment_e = ""
        for (key, value) in subs.iteritems():
            if key == "a":
                experiment_a = value[0]
                new_subs.append((key, value[0]))
            elif key == "e":
                experiment_e = value[0]
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"), experiment_e)
        translated_experiments = translate_config(experiment, experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693", field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration
    for field in record_get_field_instances(rec, "710"):
        subs = field_get_subfield_instances(field)
        for idx, (key, value) in enumerate(subs[:]):
            if key == "5":
                subs.pop(idx)
            elif value.startswith("CERN. Geneva"):
                subs.pop(idx)
        if len(subs) == 0:
            record_delete_field(rec, "710", field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, "773"):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for idx, (key, value) in enumerate(subs):
            if key == "p":
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773", field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, "856", ind1="4"):
        subs = field_get_subfields(field)

        newsubs = []
        remove = False

        if "z" in subs:
            is_figure = [s for s in subs["z"] if "figure" in s.lower()]
            if is_figure and "u" in subs:
                is_subformat = [s for s in subs["u"] if "subformat" in s.lower()]
                if not is_subformat:
                    url = subs["u"][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(suffix=os.path.basename(url), dir=CFG_TMPSHAREDDIR)
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url), verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_url(url=url, download_to_file=local_url, timeout=30.0)
                        except InvenioFileDownloadError:
                            _print("Download failed while attempting to reach %s. Skipping.." % (url,))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s" % (local_url, url), verbose=5)
                            else:
                                _print("Conversion failed on %s" % (local_url,))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(("a", url))
                        newsubs.append(("t", "Plot"))
                        figure_counter += 1
                        if "y" in subs:
                            newsubs.append(("d", "%05d %s" % (figure_counter, subs["y"][0])))
                            newsubs.append(("n", subs["y"][0]))
                        else:
                            # Get basename without extension.
                            name = os.path.basename(os.path.splitext(subs["u"][0])[0])
                            newsubs.append(("d", "%05d %s" % (figure_counter, name)))
                            newsubs.append(("n", name))

        if not newsubs and "u" in subs:
            is_fulltext = [s for s in subs["u"] if ".pdf" in s]
            if is_fulltext:
                newsubs = [("t", "INSPIRE-PUBLIC"), ("a", subs["u"][0])]

        if not newsubs and "u" in subs:
            remove = True
            is_zipfile = [s for s in subs["u"] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR, os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url), verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_url(url=is_zipfile[0], download_to_file=local_url, timeout=30.0)
                except InvenioFileDownloadError:
                    _print("Download failed while attempting to reach %s. Skipping.." % (is_zipfile[0],))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        plotsubs = []
                        plotsubs.append(("a", png))
                        caption = "%05d %s" % (figure_counter, os.path.basename(png))
                        plotsubs.append(("d", caption))
                        plotsubs.append(("t", "Plot"))
                        record_add_field(rec, "FFT", subfields=plotsubs)

        if not remove and not newsubs and "u" in subs:
            urls = (
                "http://cdsweb.cern.ch",
                "http://cms.cern.ch",
                "http://cmsdoc.cern.ch",
                "http://documents.cern.ch",
                "http://preprints.cern.ch",
                "http://cds.cern.ch",
            )
            for val in subs["u"]:
                if any(url in val for url in urls):
                    remove = True
                    break
                if val.endswith("ps.gz"):
                    remove = True

        if newsubs:
            record_add_field(rec, "FFT", subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec, "856", ind1="4", field_position_global=field[4])

    # 500 - Preliminary results
    if "THESIS" not in collections:
        subs = [("a", "Preliminary results")]
        record_add_field(rec, "500", subfields=subs)

    for collection in collections:
        record_add_field(rec, "980", subfields=[("a", collection)])

    return rec
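
# Editor's note -- the 856/FFT handling above attaches each extracted plot by
# adding an FFT field with an 'a' (local file path), 'd' (zero-padded caption)
# and 't' (doctype 'Plot') subfield.  A minimal sketch of that pattern, with
# hypothetical path/caption values and record_add_field assumed to be imported
# as in the example above:
def _attach_plot_sketch(rec, path, caption, figure_counter):
    figure_counter += 1
    plotsubs = [('a', path),
                ('d', '%05d %s' % (figure_counter, caption)),
                ('t', 'Plot')]
    record_add_field(rec, 'FFT', subfields=plotsubs)
    return figure_counter
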
Example n. 22
def bst_consyn_harvest(feed_url=None, package=None, feed_file=None,
                       package_list_file=None, batch_size='500',
                       delete_zip='False', submit='False'):
    """ Task to convert xml files from consyn.elsevier.com to Marc xml files.
    There are four execution modes:
    1. Download from an atom feed url.
    2. Extract and convert a zip package.
    3. Download from an atom feed file.
    4. Extract and convert a list of zip packages.

    The feed is stored to the file system under the folder feeds.
    If no errors occur during the execution of the tasklet the feed
    is deleted. Records may be recovered running the tasklet again with
    the modes 2, 3 or 4.

    :param feed_url: A URL to the atom feed.
    :type feed_url: string.

    :param package: A path to a zip package.
    :type package: string.

    :param feed_file: A path to an atom feed file.
    :type feed_file: string.

    :param package_list_file: A path to a file with a list of paths
                              to zip packages. The file must contain
                              the path to each package in a different
                              line.
    :type package_list_file: string.

    :param batch_size: The number of records contained in each output file.
    :type batch_size: string representation of an integer.

    :param delete_zip: Flag to indicate if the downloaded zip files
                       should be kept on the disk or not.
    :type delete_zip: string representation of a boolean.

    :param submit: Flag to indicate whether the result files
                   should be submitted by email and uploaded
                   to the FTP server.
    :type submit: string representation of a boolean.
    """
    if not feed_url:
        feed_url = "https://consyn.elsevier.com/batch/atom?key=%s" % \
                   (CFG_CONSYN_ATOM_KEY,)
    new_files = []
    new_sources = []
    feed_location = ''

    try:
        batch_size = int(batch_size)
    except ValueError:
        batch_size = 500
        write_message('Warning: batch_size parameter is not a valid integer;\n'
                      'the default value \'500\' has been used!\n')
    if delete_zip.lower() == 'true':
        delete_zip = True
    elif delete_zip.lower() == 'false':
        delete_zip = False
    else:
        delete_zip = False
        write_message('Warning: delete_zip parameter is not'
                      ' a valid Boolean (True/False);\n'
                      'the default value \'False\' has been used!\n')
    if submit.lower() == 'true':
        submit = True
    elif submit.lower() == 'false':
        submit = False
    else:
        submit = False
        write_message('Warning: submit parameter is not'
                      ' a valid Boolean (True/False);\n'
                      'the default value \'False\' has been used!\n')

    if not exists(CFG_CONSYN_OUT_DIRECTORY):
        makedirs(CFG_CONSYN_OUT_DIRECTORY)
    out_folder = CFG_CONSYN_OUT_DIRECTORY
    els = ElsevierPackage(CONSYN=True)

    consyn_files = join(out_folder, "consyn-files")
    consyn_files = consyn_files.lstrip()

    if package:
        xml_files = extract_package(package, batch_size, delete_zip,
                                    out_folder, new_sources)
    elif package_list_file:
        package_list = []
        with open(package_list_file, 'r') as package_file:
            for line in package_file:
                line = line.strip()
                if line:
                    package_list.append(line)
        xml_files = extract_multiple_packages(
            package_list, batch_size,
            delete_zip, new_sources,
            out_folder
        )
    elif feed_file:
        entries = parse_feed(feed_file)
        links = map(lambda a: a[0], entries)
        package_list = map(lambda a: a[1], entries)
        package_list = map(lambda a: join(CFG_CONSYN_OUT_DIRECTORY, a),
                           package_list)
        for package in package_list:
            if not exists(package):
                index = package_list.index(package)
                link = links[index]
                try:
                    message = ("Downloading %s to %s\n" % (link,
                                                           package))
                    write_message(message)
                    download_url(link, "zip", package, 5, 60.0)
                    package_list.append(package)
                except InvenioFileDownloadError as err:
                    message = "URL could not be opened: " + link
                    write_message(message)
                    write_message(str(err))
                    write_message(traceback.format_exc()[:-1])
                    task_update_status("CERROR")
                    continue
        # Extract once, after all missing packages have been downloaded.
        xml_files = extract_multiple_packages(
            package_list, batch_size,
            delete_zip, new_sources,
            out_folder
        )
    else:
        feeds_folder = join(CFG_CONSYN_OUT_DIRECTORY, 'feeds')
        if not exists(feeds_folder):
            makedirs(feeds_folder)
        date = datetime.now().strftime("%Y.%m.%d")
        feed_location = "feed-%s.xml" % date
        feed_location = join(feeds_folder, feed_location)
        xml_files = download_feed(feed_url, batch_size, delete_zip,
                                  new_sources, out_folder, feed_location)
    task_update_progress("Converting files 2/3...")
    results = convert_files(xml_files, els, prefix=consyn_files)
    for dummy, (status_code, result) in results.iteritems():
        if status_code == StatusCodes.OK:
            new_files.append(result)
    task_update_progress("Compiling output 3/3...")
    create_collection(batch_size, new_files, new_sources,
                      out_folder, submit)
    if feed_location and not _errors_detected:
        remove(feed_location)
    for error in _errors_detected:
        write_message(str(error))
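
# Editor's note -- hedged usage sketches for the four execution modes listed
# in the docstring above; all paths and the batch size are placeholders, and
# the boolean-like flags are passed as strings because the tasklet parses
# them itself.
def _consyn_usage_sketch(run=False):
    if not run:
        return
    bst_consyn_harvest()                                     # mode 1: default atom feed URL
    bst_consyn_harvest(package='/path/to/package.zip')       # mode 2: a single zip package
    bst_consyn_harvest(feed_file='/path/to/feed.xml')        # mode 3: an atom feed file
    bst_consyn_harvest(package_list_file='/path/to/list.txt',
                       batch_size='200', delete_zip='True')  # mode 4: a list of zip packages
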
Example n. 23
    def perform_fulltext_harvest(self, record_list, parameters):
        """
        For every record in given list APSRecord(record ID, DOI, date last
        updated), yield a APSRecord with added FFT dictionary containing URL to
        fulltext/metadata XML downloaded locally.

        If a download is unsuccessful, an error message is given.

        @return: tuple of (APSRecord, error_message)
        """
        count = 0
        request_end = None
        request_start = None
        for record in record_list:
            task_sleep_now_if_required(can_stop_too=False)
            # Unless this is the first request, lets sleep a bit
            if request_end and request_start:
                request_dt = request_end-request_start
                write_message("Checking request time (%d)"
                              % (request_dt,), verbose=3)
                if count and request_dt > 0 and request_dt < CFG_APSHARVEST_REQUEST_TIMEOUT:
                    write_message("Initiating sleep for %.1f seconds"
                                  % (request_dt,), verbose=3)
                    time.sleep(request_dt)

            count += 1
            task_update_progress("Harvesting record (%d/%d)" % (count,
                                                                len(record_list)))

            if not record.doi:
                msg = "No DOI found for record %d" % (record.recid or "",)
                write_message("Error: %s" % (msg,), stream=sys.stderr)
                yield record, msg
                continue

            url = CFG_APSHARVEST_FULLTEXT_URL % {'doi': record.doi}
            result_file = os.path.join(self.out_folder,
                                       "%s.zip" % (record.doi.replace('/', '_')))
            try:
                request_start = time.time()
                if os.path.exists(result_file):
                    # File already downloaded recently, lets see if it is the same
                    file_last_modified = get_file_modified_date(result_file)
                    if not compare_datetime_to_iso8601_date(file_last_modified, record.last_modified):
                        # File is not older than APS version, we should not download.
                        raise APSHarvesterFileExits

                write_message("Trying to save to %s" % (result_file,), verbose=5)

                result_file = download_url(url=url,
                                           download_to_file=result_file,
                                           content_type="zip",
                                           retry_count=5,
                                           timeout=60.0)
                write_message("Downloaded %s to %s" % (url, result_file), verbose=2)
            except InvenioFileDownloadError as e:
                msg = "URL could not be opened: %s" % (url,)
                write_message("Error: %s" % (msg,),
                              stream=sys.stderr)
                yield record, msg
                continue

            except APSHarvesterFileExits:
                write_message("File exists at %s" % (result_file,), verbose=2)
Example n. 24
def harvest_single(single, to_dir, selection=("tarball", "pdf")):
    """
    if we only want to harvest one id (arXiv or DESY), we can use this.

    @param: single (string): an id from arXiv or DESY
    @param: to_dir (string): where the output should be saved

    @output: the PDF and source tarball (if applicable) of this single record

    @return: (tarball, pdf): the location of the source tarball and PDF, None
            if not found
    """

    if single.find('arXiv') > -1 and 'arxiv.org' in CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.lower():
        id_str = re.findall('[a-zA-Z\\-]+/\\d+|\\d+\\.\\d+', single)[0]
        idno = id_str.split('/')
        if len(idno) > 0:
            idno = idno[-1]
        yymm = int(idno[:4])
        yymm_dir = make_useful_directories(yymm, to_dir)
        url_for_file = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + \
                       CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER + \
                       id_str
        url_for_pdf = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + \
                      CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER + \
                      id_str + '.pdf' # adds '.pdf' to avoid arXiv internal redirect from arXivID to arXivID.pdf
        individual_file = 'arXiv:' + id_str.replace('/', '_')
        individual_dir = make_single_directory(yymm_dir, individual_file)
        abs_path = os.path.join(individual_dir, individual_file)
        tarball = abs_path
        pdf = abs_path + '.pdf'

        try:
            if "tarball" in selection:
                write_message('downloading ' + url_for_file + ' to ' + tarball)
                tarball = download_url(url=url_for_file,
                                       content_type='tar',
                                       download_to_file=tarball)
        except InvenioFileDownloadError:
            tarball = None

        try:
            if "pdf" in selection:
                write_message('downloading ' + url_for_pdf + ' to ' + pdf)
                pdf = download_url(url=url_for_pdf,
                                   content_type="pdf",
                                   download_to_file=pdf)
        except InvenioFileDownloadError:
            pdf = None

        return (tarball, pdf)

    elif single.find('arXiv') > -1 and CFG_PLOTEXTRACTOR_SOURCE_BASE_URL != '':
        # hmm... is it a filesystem?
        if CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.startswith('/'):
            if not os.path.exists(CFG_PLOTEXTRACTOR_SOURCE_BASE_URL):
                write_message('PROBLEM WITH CFG_PLOTEXTRACTOR_SOURCE_BASE_URL: we cannot ' + \
                        'find this folder!')
                return (None, None)
            for root, files, dummy in os.walk(CFG_PLOTEXTRACTOR_SOURCE_BASE_URL):
                for file_name in files:
                    id_no = single.replace('arXiv', '')
                    if file_name.find(id_no) > -1 or\
                       file_name.find(id_no.replace('/', '_')) > -1 or\
                       file_name.find(id_no.replace('_', '/')) > -1 or\
                       file_name.find(id_no.replace(':', '')) > -1:
                        # that's our file!  probably.
                        return (os.path.join(root, file_name), None)

            # well, no luck there
            return (None, None)

        # okay... is it... a website?
        elif CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.startswith('http') and "tarball" in selection:
            url_for_file = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + single
            individual_file = os.path.join(to_dir, single)
            abs_path = os.path.join(to_dir, individual_file)
            try:
                abs_path = download_url(url=url_for_file,
                                        content_type='tar',
                                        download_to_file=abs_path)
            except InvenioFileDownloadError:
                abs_path = None
            return (abs_path, None)

        # well, I don't know what to do with it
        else:
            write_message('unsure how to handle CFG_PLOTEXTRACTOR_SOURCE_BASE_URL. ' + \
                  'please fix the harvest_single function in ' + \
                  'miscutil/lib/plotextractor_getter.py')
            return (None, None)

    elif single.find('DESY') > -1 and "pdf" in selection:
        # also okay!
        idno = re.findall('\\d{2,4}-\\d{3}', single)[0]
        year, number = idno.split('-')
        if len(year) < 4:
            if int(year) > 92:
                year = '19' + year
            else:
                year = '20' + year
        year_dir = make_single_directory(to_dir, year)
        desy_dir = make_single_directory(year_dir, 'DESY')
        individual_dir = make_single_directory(desy_dir, number)
        id_no = year[2:] + '-' + number + '.pdf'
        url_for_file = CFG_PLOTEXTRACTOR_DESY_BASE + year + \
                       CFG_PLOTEXTRACTOR_DESY_PIECE + id_no
        individual_file = id_no
        abs_path = os.path.join(individual_dir, individual_file)
        write_message('download ' + url_for_file + ' to ' + abs_path)
        try:
            abs_path = download_url(url=url_for_file,
                                    content_type='pdf',
                                    download_to_file=abs_path)
        except InvenioFileDownloadError:
            abs_path = None
        return (None, abs_path)
    write_message('END')
    return (None, None)
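
# Editor's note -- a hedged usage sketch for harvest_single above; the arXiv
# identifier and target directory are placeholders, and write_message is
# assumed to be imported as in the example:
def _harvest_single_sketch():
    tarball, pdf = harvest_single('arXiv:1204.1234', '/tmp/plotdata',
                                  selection=("pdf",))
    if pdf is None:
        write_message('PDF could not be harvested')
    return tarball, pdf
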
def main(args):
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName("record"):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        conference = identifier.split(":")[2]
        conference = conference.split("/")[0]
        contribution = identifier.split(":")[2]
        contribution = contribution.split("/")[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % (conference.replace(" ", ""), contribution)
        print("Querying with: %s" % (query,))
        results = perform_request_search(p=query, of="id")

        # harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll("a")
        found = False

        for link in links:
            url = urllib.quote(link["href"], safe=":/")
            if url.endswith(".pdf"):
                found = True
                if results:
                    rec = {}
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec, "856", ind1="4", subfields=[("u", url), ("y", "Fulltext")])
                record_add_field(rec, "FFT", subfields=[("a", filename), ("t", "PoS"), ("d", "Fulltext")])
                try:
                    print("Downloading " + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, "001", controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url,))
                break
        if not found:
            error_records.append(rec)

    insert_filename = "%s.insert.xml" % (input_filename,)
    append_filename = "%s.append.xml" % (input_filename,)
    errors_filename = "%s.errors.xml" % (input_filename,)

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % (
        total_records,
        len(insert_records),
        len(append_records),
        len(error_records),
        "\n".join(created_files),
    )
    print(subject)
    print(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL, CFG_POSHARVEST_EMAIL, subject, body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL,))
Example n. 26
def main(args):
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        conference = identifier.split(':')[2]
        conference = conference.split('/')[0]
        contribution = identifier.split(':')[2]
        contribution = contribution.split('/')[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query, ))
        results = perform_request_search(p=query, of="id")

        #harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False

        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('.pdf'):
                found = True
                if results:
                    rec = create_record()
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec,
                                 '856',
                                 ind1='4',
                                 subfields=[('u', url), ('y', 'PoS server')])
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', filename), ('t', 'PoS'),
                                            ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, '001', controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url, ))
                break
        if not found:
            error_records.append(rec)

        #upload to FTP
        tempfile_path = '/tmp/%s.xml' % (contribution, )
        with open(tempfile_path, 'w') as tempfile:
            tempfile.write(record_xml_output(rec))
        try:
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            write_message("%s successfully uploaded to FTP server" %
                          tempfile_path)
        except:
            write_message("Failed to upload %s to FTP server" % tempfile_path)
        remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename, )
    append_filename = "%s.append.xml" % (input_filename, )
    errors_filename = "%s.errors.xml" % (input_filename, )

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, out_folder)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(
        error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
           (total_records,
            len(insert_records),
            len(append_records),
            len(error_records),
            "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl, )
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL, CFG_POSHARVEST_EMAIL, subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL, ))
def harvest_single(single, to_dir, selection=("tarball", "pdf")):
    """
    if we only want to harvest one id (arXiv or DESY), we can use this.

    @param: single (string): an id from arXiv or DESY
    @param: to_dir (string): where the output should be saved

    @output: the PDF and source tarball (if applicable) of this single record

    @return: (tarball, pdf): the location of the source tarball and PDF, None
            if not found
    """

    if single.find(
            'arXiv'
    ) > -1 and 'arxiv.org' in CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.lower():
        id_str = re.findall('[a-zA-Z\\-]+/\\d+|\\d+\\.\\d+', single)[0]
        idno = id_str.split('/')
        if len(idno) > 0:
            idno = idno[-1]
        yymm = int(idno[:4])
        yymm_dir = make_useful_directories(yymm, to_dir)
        url_for_file = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + \
                       CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER + \
                       id_str
        url_for_pdf = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + \
                      CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER + \
                      id_str + '.pdf' # adds '.pdf' to avoid arXiv internal redirect from arXivID to arXivID.pdf
        individual_file = 'arXiv:' + id_str.replace('/', '_')
        individual_dir = make_single_directory(yymm_dir, individual_file)
        abs_path = os.path.join(individual_dir, individual_file)
        tarball = abs_path
        pdf = abs_path + '.pdf'

        try:
            if "tarball" in selection:
                write_message('downloading ' + url_for_file + ' to ' + tarball)
                tarball = download_url(url=url_for_file,
                                       content_type='tar',
                                       download_to_file=tarball)
        except InvenioFileDownloadError:
            tarball = None

        try:
            if "pdf" in selection:
                write_message('downloading ' + url_for_pdf + ' to ' + pdf)
                pdf = download_url(url=url_for_pdf,
                                   content_type="pdf",
                                   download_to_file=pdf)
        except InvenioFileDownloadError:
            pdf = None

        return (tarball, pdf)

    elif single.find('arXiv') > -1 and CFG_PLOTEXTRACTOR_SOURCE_BASE_URL != '':
        # hmm... is it a filesystem?
        if CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.startswith('/'):
            if not os.path.exists(CFG_PLOTEXTRACTOR_SOURCE_BASE_URL):
                write_message('PROBLEM WITH CFG_PLOTEXTRACTOR_SOURCE_BASE_URL: we cannot ' + \
                        'find this folder!')
                return (None, None)
            for root, files, dummy in os.walk(
                    CFG_PLOTEXTRACTOR_SOURCE_BASE_URL):
                for file_name in files:
                    id_no = single.replace('arXiv', '')
                    if file_name.find(id_no) > -1 or\
                       file_name.find(id_no.replace('/', '_')) > -1 or\
                       file_name.find(id_no.replace('_', '/')) > -1 or\
                       file_name.find(id_no.replace(':', '')) > -1:
                        # that's our file!  probably.
                        return (os.path.join(root, file_name), None)

            # well, no luck there
            return (None, None)

        # okay... is it... a website?
        elif CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.startswith(
                'http') and "tarball" in selection:
            url_for_file = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + single
            individual_file = os.path.join(to_dir, single)
            abs_path = os.path.join(to_dir, individual_file)
            try:
                abs_path = download_url(url=url_for_file,
                                        content_type='tar',
                                        download_to_file=abs_path)
            except InvenioFileDownloadError:
                abs_path = None
            return (abs_path, None)

        # well, I don't know what to do with it
        else:
            write_message('unsure how to handle CFG_PLOTEXTRACTOR_SOURCE_BASE_URL. ' + \
                  'please fix the harvest_single function in ' + \
                  'miscutil/lib/plotextractor_getter.py')
            return (None, None)

    elif single.find('DESY') > -1 and "pdf" in selection:
        # also okay!
        idno = re.findall('\\d{2,4}-\\d{3}', single)[0]
        year, number = idno.split('-')
        if len(year) < 4:
            if int(year) > 92:
                year = '19' + year
            else:
                year = '20' + year
        year_dir = make_single_directory(to_dir, year)
        desy_dir = make_single_directory(year_dir, 'DESY')
        individual_dir = make_single_directory(desy_dir, number)
        id_no = year[2:] + '-' + number + '.pdf'
        url_for_file = CFG_PLOTEXTRACTOR_DESY_BASE + year + \
                       CFG_PLOTEXTRACTOR_DESY_PIECE + id_no
        individual_file = id_no
        abs_path = os.path.join(individual_dir, individual_file)
        write_message('download ' + url_for_file + ' to ' + abs_path)
        try:
            abs_path = download_url(url=url_for_file,
                                    content_type='pdf',
                                    download_to_file=abs_path)
        except InvenioFileDownloadError:
            abs_path = None
        return (None, abs_path)
    write_message('END')
    return (None, None)
Example n. 28
def download_feed(feed_url, delete_zip, new_sources, directory, feed_location):
    """ Get list of entries from XML document """
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed_url,
                                   content_type="xml",
                                   download_to_file=feed_location,
                                   retry_count=5,
                                   timeout=60.0)
    except InvenioFileDownloadError as err:
        _errors_detected.append(err)
        write_message("URL could not be opened: %s" % (feed_url, ))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
    xml_files = []
    entries = parse_feed(result_path)

    if not entries:
        return xml_files

    # look what files already exist
    # there are currently O(10^5) files in the directory tree rooted
    # at CFG_CONSYN_OUT_DIRECTORY and it is on AFS and takes upwards
    # of 5 minutes to walk.
    # might make sense to have a db table with already harvested files
    task_sleep_now_if_required()
    allfilenames = find_names_of_existing_files(CFG_CONSYN_OUT_DIRECTORY)
    task_sleep_now_if_required()

    for fileUrl, fileName in entries:
        if fileName in allfilenames:
            write_message(
                "Not downloading %s, found file with same name in %s" % (
                    fileName,
                    CFG_CONSYN_OUT_DIRECTORY,
                ))
            continue
        task_sleep_now_if_required()

        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        fileUrl = fileUrl.replace(' ', '%20')
        try:
            write_message("Downloading %s to %s\n" % (fileUrl, outFilename))
            download_url(fileUrl, "zip", outFilename, 5, 60.0)
            new_sources.append(outFilename)
        except InvenioFileDownloadError as err:
            _errors_detected.append(err)
            write_message("URL could not be opened: %s" % fileUrl)
            write_message(str(err))
            write_message(traceback.format_exc()[:-1])
            task_update_status("CERROR")
            continue
        try:
            xml_files.extend(extractAll(outFilename, delete_zip, directory))
        except BadZipfile as err:
            _errors_detected.append(err)
            write_message("Error BadZipfile %s" % (outFilename,))
            task_update_status("CERROR")
            remove(outFilename)

    return xml_files
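# For orientation, a minimal sketch of how download_feed is invoked (the
# consyn tasklet further below does essentially this); the feed URL, key and
# paths here are placeholders.
from os.path import join

feed_url = 'https://consyn.elsevier.com/batch/atom?key=EXAMPLE_KEY'
out_folder = '/tmp/consyn'        # stands in for CFG_CONSYN_OUT_DIRECTORY
feed_location = join(out_folder, 'feeds', 'feed-2015.01.01.xml')
new_sources = []                  # filled with the paths of downloaded zips

xml_files = download_feed(feed_url,
                          delete_zip=False,
                          new_sources=new_sources,
                          directory=out_folder,
                          feed_location=feed_location)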
Example 29
def bst_consyn_harvest(feed_url=None,
                       package=None,
                       feed_file=None,
                       package_list_file=None,
                       batch_size='500',
                       delete_zip='False',
                       submit='False',
                       threshold_date=None):
    """ Task to convert xml files from consyn.elsevier.com to Marc xml files.
    There are four execution modes:
    1. Download from an atom feed url.
    2. Extract and convert a zip package.
    3. Download from an atom feed file.
    4. Extract and convert a list of zip packages.

    The feed is stored to the file system under the folder feeds.
    If no errors occur during the execution of the tasklet the feed
    is deleted. Records may be recovered running the tasklet again with
    the modes 2, 3 or 4.

    :param feed_url: A URL to the atom feed.
    :type feed_url: string.

    :param package: A path to a zip package.
    :type package: string.

    :param feed_file: A path to an atom feed file.
    :type feed_file: string.

    :param package_list_file: A path to a file with a list of paths
                              to zip packages. The file must contain
                              the path to each package in a different
                              line.
    :type package_list_file: string.

    :param batch_size: The number of records contained in each output file.
    :type batch_size: string representation of an integer.

    :param delete_zip: Flag to indicate if the downloaded zip files
                       should be kept on the disk or not.
    :type delete_zip: string representation of a boolean.

    :param submit: Flag to indicate whether the result files
                       should be submitted by email and uploaded
                       to the FTP server.
    :type submit: string representation of a boolean.

    :param threshold_date: Only records published after this date
                           are converted.
    :type threshold_date: string in the format YYYY-MM-DD.
    """
    if not feed_url:
        feed_url = "https://consyn.elsevier.com/batch/atom?key=%s" % \
                   (CFG_CONSYN_ATOM_KEY,)
    new_files = []
    new_sources = []
    feed_location = ''

    try:
        batch_size = int(batch_size)
    except ValueError:
        batch_size = 500
        write_message('Warning batch_size parameter is not a valid integer\n'
                      'the default value \'500\' has been used!\n')
    if delete_zip.lower() == 'true':
        delete_zip = True
    elif delete_zip.lower() == 'false':
        delete_zip = False
    else:
        delete_zip = False
        write_message('Warning delete_zip parameter is not'
                      ' a valid Boolean (True/False)\n'
                      'the default value \'False\' has been used!\n')
    if submit.lower() == 'true':
        submit = True
    elif submit.lower() == 'false':
        submit = False
    else:
        submit = False
        write_message('Warning submit parameter is not'
                      ' a valid Boolean (True/False)\n'
                      'the default value \'False\' has been used!\n')
    if threshold_date:
        import time
        date_format = "%Y-%m-%d"
        try:
            date = datetime(*(time.strptime(threshold_date, date_format)[0:6]))
            threshold_date = date.strftime('%Y-%m-%d')
        except ValueError:
            write_message('Error threshold_date parameter is not '
                          'in the right format. It should be in '
                          'form "YYYY-MM-DD".')
            task_update_status("ERROR")
            return

    if not exists(CFG_CONSYN_OUT_DIRECTORY):
        makedirs(CFG_CONSYN_OUT_DIRECTORY)
    out_folder = CFG_CONSYN_OUT_DIRECTORY
    journal_mappings = get_kbs()['journals'][1]
    els = ElsevierPackage(CONSYN=True, journal_mappings=journal_mappings)

    consyn_files = join(out_folder, "consyn-files")
    consyn_files = consyn_files.lstrip()

    if package:
        xml_files = extract_package(package, delete_zip, out_folder,
                                    new_sources)
    elif package_list_file:
        package_list = []
        with open(package_list_file, 'r') as package_file:
            for line in package_file:
                line = line.strip()
                if line:
                    package_list.append(line)
        xml_files = extract_multiple_packages(package_list, delete_zip,
                                              new_sources, out_folder)
    elif feed_file:
        entries = parse_feed(feed_file)
        links = [a[0] for a in entries]
        package_list = [a[1] for a in entries]
        package_list = [
            join(CFG_CONSYN_OUT_DIRECTORY, a) for a in package_list
        ]
        for package in package_list:
            task_sleep_now_if_required()
            if not exists(package):
                index = package_list.index(package)
                link = links[index]
                link = link.replace(' ', '%20')
                try:
                    message = ("Downloading %s to %s\n" % (link, package))
                    write_message(message)
                    download_url(link, "zip", package, 5, 60.0)
                    new_sources.append(package)
                except InvenioFileDownloadError as err:
                    message = "URL could not be opened: " + link
                    write_message(message)
                    write_message(str(err))
                    write_message(traceback.format_exc()[:-1])
                    task_update_status("CERROR")
                    continue
        # extract once, after any missing packages have been downloaded
        xml_files = extract_multiple_packages(package_list, delete_zip,
                                              new_sources, out_folder)
    else:
        feeds_folder = join(CFG_CONSYN_OUT_DIRECTORY, 'feeds')
        if not exists(feeds_folder):
            makedirs(feeds_folder)
        date = datetime.now().strftime("%Y.%m.%d")
        feed_location = "feed-%s.xml" % date
        feed_location = join(feeds_folder, feed_location)
        xml_files = download_feed(feed_url, delete_zip, new_sources,
                                  out_folder, feed_location)
    task_update_progress("Converting files 2/3...")
    task_sleep_now_if_required()
    results = convert_files(xml_files,
                            els,
                            prefix=consyn_files,
                            threshold_date=threshold_date)
    for dummy, (status_code, result) in results.iteritems():
        if status_code == StatusCodes.OK:
            new_files.append(result)
    task_update_progress("Compiling output 3/3...")
    task_sleep_now_if_required()
    create_collection(batch_size, new_files, new_sources, out_folder, submit)
    if feed_location and not _errors_detected:
        remove(feed_location)
    for error in _errors_detected:
        write_message(str(error))
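# A usage sketch for the four execution modes described in the docstring of
# bst_consyn_harvest above.  The import path is an assumption (it depends on
# how the tasklet is installed) and every file path is a placeholder; note
# that the flag parameters are passed as strings, as the docstring states.
from invenio.bibsched_tasklets.bst_consyn_harvest import bst_consyn_harvest  # assumed

# 1. Download from the default atom feed URL (built from CFG_CONSYN_ATOM_KEY).
bst_consyn_harvest()

# 2. Extract and convert a single zip package.
bst_consyn_harvest(package='/path/to/package.zip', delete_zip='True')

# 3. Re-run from a previously stored atom feed file.
bst_consyn_harvest(feed_file='/path/to/feeds/feed-2015.01.01.xml')

# 4. Extract and convert a list of zip packages, one path per line.
bst_consyn_harvest(package_list_file='/path/to/package_list.txt',
                   batch_size='1000', submit='False')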
Example 30
        # Get URL and filename
        fileUrl = entry.getElementsByTagName("link")[0].getAttribute("href")
        fileName = entry.getElementsByTagName("title")[0].firstChild.data
        updated = entry.getElementsByTagName("updated")[0].firstChild.data
        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        # file has already been fetched
        if outFilename in downloaded_files:
            write_message("Not downloading %s, already found %s\n" %
                          (fileUrl, outFilename))
        else:
            try:
                write_message("Downloading %s to %s\n" % (fileUrl, outFilename))
                download_url(fileUrl, "zip", outFilename, 5, 60.0)
                new_sources.append(outFilename)
            except InvenioFileDownloadError as err:
                write_message("URL could not be opened: %s" % (fileUrl,))
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
            size = getsize(outFilename)
            run_sql("INSERT INTO CONSYNHARVEST"
                    "(filename,date,size)"
                    "VALUES (%s,%s,%s)",
                    (outFilename, updated, size))
            try:
                extractAll(outFilename, delete_zip, directory)
            except BadZipfile:
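# The fragment above pulls "link", "title" and "updated" straight out of the
# Atom <entry> elements with minidom.  A parse_feed helper along the same
# lines might look like the sketch below; this is an illustration based on
# those element names, not the actual implementation used by download_feed.
from xml.dom.minidom import parse

def parse_feed_sketch(feed_path):
    """Return a list of (file_url, file_name) tuples from an Atom feed file."""
    entries = []
    for entry in parse(feed_path).getElementsByTagName("entry"):
        file_url = entry.getElementsByTagName("link")[0].getAttribute("href")
        file_name = entry.getElementsByTagName("title")[0].firstChild.data
        entries.append((file_url, file_name))
    return entries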
Example 32
    def perform_fulltext_harvest(self, record_list, parameters):
        """
        For every record in the given list of APSRecords (record ID, DOI, date
        last updated), yield an APSRecord with an added FFT dictionary
        containing the URL to the fulltext/metadata XML downloaded locally.

        If a download is unsuccessful, an error message is given.

        @return: tuple of (APSRecord, error_message)
        """
        count = 0
        request_end = None
        request_start = None
        for record in record_list:
            task_sleep_now_if_required(can_stop_too=False)
            # Unless this is the first request, let's sleep a bit
            if request_end and request_start:
                request_dt = request_end-request_start
                write_message("Checking request time (%d)"
                              % (request_dt,), verbose=3)
                if count and request_dt > 0 and request_dt < CFG_APSHARVEST_REQUEST_TIMEOUT:
                    write_message("Initiating sleep for %.1f seconds"
                                  % (request_dt,), verbose=3)
                    time.sleep(request_dt)

            count += 1
            task_update_progress("Harvesting record (%d/%d)" % (count,
                                                                len(record_list)))

            if not record.doi:
                msg = "No DOI found for record %d" % (record.recid or "",)
                write_message("Error: %s" % (msg,), stream=sys.stderr)
                yield record, msg
                continue

            url = CFG_APSHARVEST_FULLTEXT_URL % {'doi': record.doi}
            result_file = os.path.join(self.zip_folder,
                                       "%s.zip" % (record.doi.replace('/', '_')))
            try:
                request_start = time.time()
                if os.path.exists(result_file):
                    # File already downloaded recently, let's see if it is the same
                    file_last_modified = get_file_modified_date(result_file)
                    if record.last_modified and not compare_datetime_to_iso8601_date(file_last_modified, record.last_modified):
                        # File is not older than APS version, we should not download.
                        raise APSHarvesterFileExits

                write_message("Trying to save to %s" % (result_file,), verbose=5)

                result_file = download_url(url=url,
                                           download_to_file=result_file,
                                           content_type="zip",
                                           accept="application/zip",
                                           retry_count=5,
                                           timeout=60.0)
                write_message("Downloaded %s to %s" % (url, result_file), verbose=2)
            except InvenioFileDownloadError as e:
                msg = "URL could not be opened: %s" % (url,)
                write_message("Error: %s" % (msg,),
                              stream=sys.stderr)
                yield record, msg
                continue

            except APSHarvesterFileExits:
                write_message("File exists at %s" % (result_file,), verbose=2)