Example 1
def arxiv_fft_get(obj, eng):
    """Get FFT from arXiv, if arXiv ID is provided."""
    deposition = Deposition(obj)
    sip = deposition.get_latest_sip(sealed=False)
    metadata = sip.metadata

    if 'arxiv_id' in metadata and metadata['arxiv_id']:
        arxiv_pdf_url = cfg.get("ARXIV_PDF_URL", "http://arxiv.org/pdf/") + \
            "{0}.{1}"

        from invenio.config import CFG_TMPSHAREDDIR
        arxiv_file, arxiv_file_path = mkstemp(
            prefix="%s_" % (metadata['arxiv_id'].replace("/", "_")),
            suffix='.pdf',
            dir=CFG_TMPSHAREDDIR,
        )
        os.close(arxiv_file)

        download_url(url=arxiv_pdf_url.format(metadata['arxiv_id'], "pdf"),
                     content_type="pdf",
                     download_to_file=arxiv_file_path)

        # To get 1111.2222.pdf as filename.
        filename = "{0}.pdf".format(metadata['arxiv_id'].replace("/", "_"))

        try:
            save_deposition_file(deposition,
                                 filename,
                                 arxiv_file_path)
        except FilenameAlreadyExists:
            obj.log.error("PDF file not saved: filename already exists.")
        except Exception as e:
            # str(e) works on both Python 2 and 3 (e.message is gone in 3)
            obj.log.error("PDF file not saved: {0}.".format(e))
Example 2
def old_URL_harvest(from_date, to_date, to_dir, area):
    """
        Grab all the PDFs and tarballs off arXiv between from_date and to_date,
        where from_date and to_date are in YYMM form, and put them in their own
        separate folders inside of to_dir.  Folder hierarchy will be
            to_dir/YYYY/MM/arXiv_id/stuff_downloaded_from_arXiv
        this obeys the old URL format

        @param: from_date (int): YYMM form of the date where we want to start
            harvesting
        @param: to_date (int): YYMM form of the date where we want to stop
            harvesting
        @param: to_dir (string): the base directory to put all these subdirs in
        @param: area (int): the index in the HEP_AREAS array of the area we are
            currently working on downloading

        @output: PDFs and tarballs from arXiv in a hierarchy rooted at to_dir
        @return: None
    """

    yearmonthindex = from_date

    while yearmonthindex < to_date:

        sub_dir = make_useful_directories(yearmonthindex, to_dir)

        for paperindex in range(1, 1000):
            # For whatever reason, we cannot count on paper numbers to
            # start at 1 (in HEP_PH from 9403 to CENTURY_END only):
            # they start at 202.
            #if area == HEP_PH and yearmonthindex < ARBITRARY_FROM_INDEX:
            #   paperindex = paperindex + 201
            # Of note: before the URL change happened in 0704, paper
            # numbers only had 3 digits.
            next_to_harvest = '%04d%03d' % (yearmonthindex, paperindex)
            arXiv_id = area[AREA_STRING_INDEX] + next_to_harvest
            individual_dir = make_single_directory(sub_dir, arXiv_id)

            full_url = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER + \
                       area[URL] + next_to_harvest
            abs_path = os.path.join(individual_dir,
                                    area[AREA_STRING_INDEX] + next_to_harvest)
            if not download_url(url=full_url,
                                content_type='tar',
                                download_to_file=abs_path):
                break
            full_pdf_url = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER + \
                           area[URL] + next_to_harvest
            abs_path = os.path.join(
                individual_dir,
                area[AREA_STRING_INDEX] + next_to_harvest + PDF_EXTENSION)
            download_url(url=full_pdf_url,
                         content_type='pdf',
                         download_to_file=abs_path)
            time.sleep(CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT)
        if yearmonthindex % 100 == 12:
            # we reached the end of the year!
            yearmonthindex = yearmonthindex + FIX_FOR_YEAR_END
        yearmonthindex = yearmonthindex + 1
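The year-end arithmetic is easiest to see with a worked value. Assuming
FIX_FOR_YEAR_END is 88 (the constant itself is defined outside this excerpt),
a December index rolls over to January of the next year:

FIX_FOR_YEAR_END = 88          # assumed value; defined elsewhere in the module

yearmonthindex = 9412          # December 1994
if yearmonthindex % 100 == 12:
    yearmonthindex += FIX_FOR_YEAR_END
yearmonthindex += 1
print(yearmonthindex)          # 9501, i.e. January 1995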
Example 3
def new_URL_harvest(from_date, from_index, to_dir):
    """
        Grab all the PDFs and tarballs off arXiv between from_date and to_date,
        where from_date and to_date are in YYMM form, and put them in their own
        separate folders inside of to_dir.  Folder hierarchy will be
            to_dir/YYYY/MM/arXiv_id/stuff_downloaded_from_arXiv
        this obeys the new URL format

        @param: from_date (int): YYMM form of the date where we want to start
            harvesting
        @param: to_date (int): YYMM form of the date where we want to stop
            harvesting
        @param: to_dir (string): the base directory to put all these subdirs in

        @output: PDFs and tarballs from arXiv in a hierarchy rooted at to_dir
        @return: None
    """

    global current_yearmonth
    yearmonthindex = from_date

    while yearmonthindex < current_yearmonth:

        if yearmonthindex == from_date:
            fro = from_index
        else:
            fro = 1

        sub_dir = make_useful_directories(yearmonthindex, to_dir)

        for paperindex in range(fro, 10000):

            # of note: after the URL change happened in 0704, it was
            # the case that paper numbers had 4 digits
            next_to_harvest = '%04d.%04d' % (yearmonthindex, paperindex)
            arXiv_id = ARXIV_HEADER + next_to_harvest
            individual_dir = make_single_directory(sub_dir, arXiv_id)

            full_url = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER + \
                       next_to_harvest
            abs_path = os.path.join(individual_dir, ARXIV_HEADER + next_to_harvest)
            if not download_url(url=full_url,
                                content_type='tar',
                                download_to_file=abs_path):
                break

            full_pdf_url = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER + \
                           next_to_harvest
            abs_path = os.path.join(individual_dir, ARXIV_HEADER + next_to_harvest + PDF_EXTENSION)
            download_url(url=full_pdf_url,
                         content_type='pdf',
                         download_to_file=abs_path)
            time.sleep(CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT) # be nice to remote server

        if yearmonthindex % 100 == 12:
            # we reached the end of the year!
            yearmonthindex = yearmonthindex + FIX_FOR_YEAR_END
        yearmonthindex = yearmonthindex + 1
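The two harvesters differ mainly in how the identifier fragment is built: the
old scheme uses an area prefix plus a 3-digit paper number, while the new
scheme uses a dotted 4-digit paper number. A minimal sketch with illustrative
values (the 'hep-ph/' prefix stands in for area[AREA_STRING_INDEX]):

print('hep-ph/' + '%04d%03d' % (703, 123))   # hep-ph/0703123  (old scheme)
print('%04d.%04d' % (704, 123))              # 0704.0123       (new scheme)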
Example 4
    def test_content_type(self):
        """Test simple calls to download_url."""
        from invenio.utils.filedownload import (download_url,
                                                InvenioFileDownloadError)
        tmpdoc = download_url("http://duckduckgo.com", content_type="html")
        self.assertTrue(tmpdoc)

        fun = lambda: download_url("http://google.com", content_type="pdf")
        self.assertRaises(InvenioFileDownloadError, fun)
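Outside a test, the same content-type mismatch is usually guarded with an
explicit try/except rather than assertRaises; a minimal sketch:

from invenio.utils.filedownload import download_url, InvenioFileDownloadError

try:
    # google.com serves HTML, so requesting a PDF should fail
    tmpdoc = download_url("http://google.com", content_type="pdf")
except InvenioFileDownloadError:
    tmpdoc = None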
Example 5
def parse_and_download(infile, sdir):
    """
    Read the write_messageation in the input file and download the corresponding
    tarballs from arxiv.

    @param: infile (string): the name of the file to parse
    @param: sdir (string): where to put the downloaded tarballs
    """

    tarfiles = []

    tardir = os.path.join(sdir, 'tarballs')
    if not os.path.isdir(tardir):
        try:
            os.makedirs(tardir)
        except OSError:
            write_message(sys.exc_info()[0])
            write_message('files will be loose, not in ' + tardir)
            tardir = sdir

    with open(infile) as url_list:
        for line in url_list:
            line = line.strip()
            if line.startswith('http://'):
                # hurray!
                url = line
                filename = url.split('/')[-1]
                abs_path = os.path.join(tardir, filename)
                if not download_url(url=url,
                                    content_type='tar',
                                    download_to_file=abs_path):
                    write_message(filename + ' may already exist')
                    write_message(sys.exc_info()[0])
                filename = os.path.join(tardir, filename)
                tarfiles.append(filename)
                time.sleep(CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT)  # be nice!
            elif line.startswith('arXiv'):
                tarfiles.extend(tarballs_by_arXiv_id([line], sdir))

    return tarfiles
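The input file mixes direct URLs and arXiv ids, one entry per line. A sketch
of a file this function would accept, and a hypothetical call (both the file
name and the entries are illustrative):

# contents of urls.txt, one entry per line:
#   http://export.arxiv.org/e-print/1111.2222
#   arXiv:1111.2222
tarfiles = parse_and_download('urls.txt', '/tmp/harvest')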
Example 6
def output_keywords_for_sources(input_sources, taxonomy_name, output_mode="text",
                                output_limit=cfg['CLASSIFIER_DEFAULT_OUTPUT_NUMBER'], spires=False,
                                match_mode="full", no_cache=False, with_author_keywords=False,
                                rebuild_cache=False, only_core_tags=False, extract_acronyms=False,
                                **kwargs):
    """Output the keywords for each source in sources."""
    from invenio.legacy.refextract.engine import get_plaintext_document_body

    # Inner function which does the actual work; refactoring the call sites
    # would be too much work.  It must be defined outside the loop (before,
    # it did not process multiple files).
    def process_lines():
        if output_mode == "text":
            print("Input file: %s" % source)

        line_nb = len(text_lines)
        word_nb = 0
        for line in text_lines:
            word_nb += len(re.findall(r"\S+", line))

        current_app.logger.info("Remote file has %d lines and %d words.",
                                line_nb, word_nb)
        return get_keywords_from_text(
            text_lines,
            taxonomy_name,
            output_mode=output_mode,
            output_limit=output_limit,
            spires=spires,
            match_mode=match_mode,
            no_cache=no_cache,
            with_author_keywords=with_author_keywords,
            rebuild_cache=rebuild_cache,
            only_core_tags=only_core_tags,
            extract_acronyms=extract_acronyms
        )

    # Get the fulltext for each source.
    for entry in input_sources:
        current_app.logger.info("Trying to read input file %s." % entry)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                if filename.startswith('.'):
                    continue
                filename = os.path.join(entry, filename)
                if os.path.isfile(filename):
                    text_lines, dummy = get_plaintext_document_body(filename)
                    if text_lines:
                        source = filename
                        process_lines()
        elif os.path.isfile(entry):
            text_lines, dummy = get_plaintext_document_body(entry)
            if text_lines:
                source = os.path.basename(entry)
                process_lines()
        else:
            # Treat as a URL.
            local_file = download_url(entry)
            text_lines, dummy = get_plaintext_document_body(local_file)
            if text_lines:
                source = entry.split("/")[-1]
                process_lines()
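Each input source may be a directory, a single file, or a URL; in the URL case
the document is first fetched with download_url. A hypothetical call (the
taxonomy name and paths are illustrative):

output_keywords_for_sources(
    ["/data/papers/", "paper.pdf", "http://arxiv.org/pdf/1111.2222"],
    "HEP",
    output_mode="text",
)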
Example 7
def harvest_single(single, to_dir, selection=("tarball", "pdf")):
    """
    if we only want to harvest one id (arXiv or DESY), we can use this.

    @param: single (string): an id from arXiv or DESY
    @param: to_dir (string): where the output should be saved

    @output: the PDF and source tarball (if applicable) of this single record

    @return: (tarball, pdf): the location of the source tarball and PDF, None
            if not found
    """

    if single.find('arXiv') > -1 and 'arxiv.org' in CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.lower():
        id_str = re.findall(r'[a-zA-Z\-]+/\d+|\d+\.\d+', single)[0]
        # split() always returns at least one element, so [-1] is safe
        idno = id_str.split('/')[-1]
        yymm = int(idno[:4])
        yymm_dir = make_useful_directories(yymm, to_dir)
        url_for_file = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + \
                       CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER + \
                       id_str
        url_for_pdf = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + \
                      CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER + \
                      id_str + '.pdf' # adds '.pdf' to avoid arXiv internal redirect from arXivID to arXivID.pdf
        individual_file = 'arXiv:' + id_str.replace('/', '_')
        individual_dir = make_single_directory(yymm_dir, individual_file)
        abs_path = os.path.join(individual_dir, individual_file)
        tarball = abs_path
        pdf = abs_path + '.pdf'

        try:
            if "tarball" in selection:
                write_message('downloading ' + url_for_file + ' to ' + tarball)
                tarball = download_url(url=url_for_file,
                                       content_type='tar',
                                       download_to_file=tarball)
        except InvenioFileDownloadError:
            tarball = None

        try:
            if "pdf" in selection:
                write_message('downloading ' + url_for_pdf + ' to ' + pdf)
                pdf = download_url(url=url_for_pdf,
                                   content_type="pdf",
                                   download_to_file=pdf)
        except InvenioFileDownloadError:
            pdf = None

        return (tarball, pdf)

    elif single.find('arXiv') > -1 and CFG_PLOTEXTRACTOR_SOURCE_BASE_URL != '':
        # hmm... is it a filesystem?
        if CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.startswith('/'):
            if not os.path.exists(CFG_PLOTEXTRACTOR_SOURCE_BASE_URL):
                write_message('PROBLEM WITH CFG_PLOTEXTRACTOR_SOURCE_BASE_URL: we cannot ' + \
                        'find this folder!')
                return (None, None)
            # os.walk yields (dirpath, dirnames, filenames); we want filenames
            for root, dummy, files in os.walk(CFG_PLOTEXTRACTOR_SOURCE_BASE_URL):
                for file_name in files:
                    id_no = single.replace('arXiv', '')
                    if file_name.find(id_no) > -1 or\
                       file_name.find(id_no.replace('/', '_')) > -1 or\
                       file_name.find(id_no.replace('_', '/')) > -1 or\
                       file_name.find(id_no.replace(':', '')) > -1:
                        # that's our file!  probably.
                        return (os.path.join(root, file_name), None)

            # well, no luck there
            return (None, None)

        # okay... is it... a website?
        elif CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.startswith('http') and "tarball" in selection:
            url_for_file = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + single
            abs_path = os.path.join(to_dir, single)
            try:
                abs_path = download_url(url=url_for_file,
                                        content_type='tar',
                                        download_to_file=abs_path)
            except InvenioFileDownloadError:
                abs_path = None
            return (abs_path, None)

        # well, I don't know what to do with it
        else:
            write_message('unsure how to handle CFG_PLOTEXTRACTOR_SOURCE_BASE_URL. ' + \
                  'please fix the harvest_single function in ' + \
                  'miscutil/lib/plotextractor_getter.py')
            return (None, None)

    elif single.find('DESY') > -1 and "pdf" in selection:
        # also okay!
        idno = re.findall(r'\d{2,4}-\d{3}', single)[0]
        year, number = idno.split('-')
        if len(year) < 4:
            if int(year) > 92:
                year = '19' + year
            else:
                year = '20' + year
        year_dir = make_single_directory(to_dir, year)
        desy_dir = make_single_directory(year_dir, 'DESY')
        individual_dir = make_single_directory(desy_dir, number)
        id_no = year[2:] + '-' + number + '.pdf'
        url_for_file = CFG_PLOTEXTRACTOR_DESY_BASE + year + \
                       CFG_PLOTEXTRACTOR_DESY_PIECE + id_no
        individual_file = id_no
        abs_path = os.path.join(individual_dir, individual_file)
        write_message('download ' + url_for_file + ' to ' + abs_path)
        try:
            abs_path = download_url(url=url_for_file,
                                    content_type='pdf',
                                    download_to_file=abs_path)
        except InvenioFileDownloadError:
            abs_path = None
        return (None, abs_path)
    write_message('END')
    return (None, None)
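A hypothetical call for a single arXiv record, downloading only the PDF (the
id and target directory are illustrative):

tarball, pdf = harvest_single('arXiv:1111.2222', '/tmp/harvest',
                              selection=("pdf",))
if pdf is None:
    write_message('PDF could not be downloaded')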