def download(url, filename, to_dir):
    """
        Actually does the call and download given a URL and desired output
        filename.

        @param: url (string): where the file lives on the interwebs
        @param: filename (string): where the file should live after download
        @param: to_dir (string): the dir where our new files will live

        @output: a file in to_dir

        @return: True on success, False on failure
    """
    new_file = os.path.join(to_dir, filename)

    try:
        urllib.urlretrieve(url, new_file)
        write_message('Downloaded to ' + new_file)
        time.sleep(5) # be nice to remote server
        return True
    except IOError:
        # this could be a permissions error, but it probably means that
        # there's nothing left in that section YYMM
        write_message('Nothing at ' + new_file)
        return False
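For comparison, the same fetch-then-sleep pattern on the Python 3 standard library, where urlretrieve moved to urllib.request and IOError became an alias of OSError. This is a minimal sketch, not part of the original module; write_message is replaced by print to keep it self-contained:

import os
import time
import urllib.request

def download_py3(url, filename, to_dir):
    """Sketch of the same idea on Python 3: fetch url into to_dir/filename."""
    new_file = os.path.join(to_dir, filename)
    try:
        # urlretrieve lives in urllib.request on Python 3
        urllib.request.urlretrieve(url, new_file)
        print('Downloaded to ' + new_file)
        time.sleep(5)  # be nice to the remote server
        return True
    except OSError:  # covers the old IOError case
        print('Nothing at ' + new_file)
        return False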
def download(url, filename, to_dir):
    """
        Actually does the call and download given a URL and desired output
        filename.

        @param: url (string): where the file lives on the interwebs
        @param: filename (string): where the file should live after download
        @param: to_dir (string): the dir where our new files will live

        @output: a file in to_dir

        @return: True on success, False on failure
    """
    new_file = os.path.join(to_dir, filename)

    try:
        urllib.urlretrieve(url, new_file)
        write_message("Downloaded to " + new_file)
        time.sleep(7)  # be nice to arXiv!
        return True
    except IOError:
        # this could be a permissions error, but it probably means that
        # there's nothing left in that section YYMM
        write_message("Nothing at " + new_file)
        return False
Example #3
def download(url, filename, to_dir):
    """
        Actually does the call and download given a URL and desired output
        filename.

        @param: url (string): where the file lives on the interwebs
        @param: filename (string): where the file should live after download
        @param: to_dir (string): the dir where our new files will live

        @output: a file in to_dir

        @return: True on success, False on failure
    """
    new_file = os.path.join(to_dir, filename)

    try:
        conn = PLOTEXTRACTOR_OPENER.open(url)
        response = conn.read()
        conn.close()
        new_file_fd = open(new_file, 'w')
        new_file_fd.write(response)
        new_file_fd.close()
        write_message('Downloaded to ' + new_file)
        return True
    except (IOError, urllib2.URLError) as e:
        # this could be a permissions error, but it probably means that
        # there's nothing left in that section YYMM
        write_message('Error downloading from %s: \n%s\n' % (url, str(e)))
        return False
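One detail worth flagging in the variant above: the response is written with mode 'w', which can translate newlines on some platforms and corrupt binary payloads such as tarballs. A hedged sketch of the safer binary write, with the opener passed in explicitly (names are illustrative):

def fetch_binary(opener, url, new_file):
    """Sketch: read url through a urllib2-style opener, write bytes intact."""
    conn = opener.open(url)
    try:
        # 'wb' keeps the tarball byte-for-byte identical
        with open(new_file, 'wb') as new_file_fd:
            new_file_fd.write(conn.read())
    finally:
        conn.close()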
Example #4
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], param_abbrs, params)
    except getopt.GetoptError as err:
        write_message(str(err))
        usage()
        sys.exit(2)
Example #5
def get_defaults(tarball, sdir, refno_url):
    """
    A function for parameter-checking.

    @param: tarball (string): the location of the tarball to be extracted
    @param: sdir (string): the location of the scratch directory for untarring,
        conversions, and the ultimate destination of the MARCXML
    @param: refno_url (string): server location on where to look for refno

    @return sdir, refno (string, string): the same
        arguments it was sent as is appropriate.
    """

    if sdir is None:
        # Missing sdir: using default directory: CFG_TMPDIR
        sdir = CFG_TMPDIR
    else:
        sdir = os.path.split(tarball)[0]

    # make a subdir in the scratch directory for each tarball
    sdir = make_single_directory(sdir, \
                                 os.path.split(tarball)[-1] + '_' + PLOTS_DIR)
    if refno_url != "":
        refno = get_reference_number(tarball, refno_url)
        if refno is None:
            refno = os.path.basename(tarball)
            write_message('Error: can\'t find record id for %s' % (refno,))
    else:
        refno = os.path.basename(tarball)
        write_message("Skipping ref-no check")
    return sdir, refno
Example #6
def get_defaults(tarball, sdir):
    '''
    A function for parameter-checking.

    @param: tarball (string): the location of the tarball to be extracted
    @param: sdir (string): the location of the scratch directory for untarring,
        conversions, and the ultimate destination of the MARCXML

    @return sdir, refno (string, string): the same
        arguments it was sent as is appropriate.
    '''

    if sdir is None:
        write_message('using default directory: ' + CFG_TMPDIR +\
             ' for scratchwork')
        sdir = CFG_TMPDIR

    else:
        sdir = os.path.split(tarball)[0]

    # make a subdir in the scratch directory for each tarball
    sdir = make_single_directory(sdir, \
                                 os.path.split(tarball)[-1] + '_' + PLOTS_DIR)

    arXiv_id = os.path.split(tarball)[-1]

    refno = get_reference_number(tarball)
    if refno == tarball:
        write_message('can\'t find record id for ' + arXiv_id)

    return sdir, refno
def download(url, filename, to_dir):
    """
        Actually does the call and download given a URL and desired output
        filename.

        @param: url (string): where the file lives on the interwebs
        @param: filename (string): where the file should live after download
        @param: to_dir (string): the dir where our new files will live

        @output: a file in to_dir

        @return: True on success, False on failure
    """
    new_file = os.path.join(to_dir, filename)

    try:
        conn = urllib2.urlopen(url)
        response = conn.read()
        conn.close()
        new_file_fd = open(new_file, 'w')
        new_file_fd.write(response)
        new_file_fd.close()
        write_message('Downloaded to ' + new_file)
        return True
    except (IOError, urllib2.URLError) as e:
        # this could be a permissions error, but it probably means that
        # there's nothing left in that section YYMM
        write_message('Error downloading from %s: \n%s\n' % (url, str(e)))
        return False
Example #8
def main():
    """
    The main program loop.
    """
    help_param = 'help'
    verbose_param = 'verbose'
    tarball_param = 'tarball'
    tardir_param = 'tdir'
    infile_param = 'input'
    sdir_param = 'sdir'
    extract_text_param = 'extract-text'
    force_param = 'force'
    upload_param = 'call-bibupload'
    yes_i_know_param = 'yes-i-know'
    recid_param = 'recid'
    arXiv_param = 'arXiv'
    squash_param = 'squash'
    refno_url_param = 'refno-url'
    refno_param = 'skip-refno'
    clean_param = 'clean'
    param_abbrs = 'h:t:d:s:i:a:l:xfuyrqck'
    params = [help_param, tarball_param + '=', tardir_param + '=', \
              sdir_param + '=', infile_param + '=', arXiv_param + '=', refno_url_param + '=', \
              extract_text_param, force_param, upload_param, yes_i_know_param, recid_param, \
              squash_param, clean_param]
    try:
        opts, args = getopt.getopt(sys.argv[1:], param_abbrs, params)
    except getopt.GetoptError as err:
        write_message(str(err))
        usage()
        sys.exit(2)
Example #9
def main():
    """
    The main program loop.
    """
    help_param = "help"
    verbose_param = "verbose"
    tarball_param = "tarball"
    tardir_param = "tdir"
    infile_param = "input"
    sdir_param = "sdir"
    extract_text_param = "extract-text"
    force_param = "force"
    upload_param = "call-bibupload"
    upload_mode_param = "upload-mode"
    yes_i_know_param = "yes-i-know"
    recid_param = "recid"
    with_docname_param = "with-docname"
    with_doctype_param = "with-doctype"
    with_docformat_param = "with-docformat"
    arXiv_param = "arXiv"
    squash_param = "squash"
    refno_url_param = "refno-url"
    refno_param = "skip-refno"
    clean_param = "clean"
    param_abbrs = "h:t:d:s:i:a:l:xfuyr:qck"
    params = [
        help_param,
        tarball_param + "=",
        tardir_param + "=",
        sdir_param + "=",
        infile_param + "=",
        arXiv_param + "=",
        refno_url_param + "=",
        extract_text_param,
        force_param,
        upload_param,
        yes_i_know_param,
        recid_param + "=",
        squash_param,
        clean_param,
        refno_param,
        with_docname_param + "=",
        with_doctype_param + "=",
        with_docformat_param + "=",
        upload_mode_param + "=",
    ]
    try:
        opts, args = getopt.getopt(sys.argv[1:], param_abbrs, params)
    except getopt.GetoptError as err:
        write_message(str(err))
        usage()
        sys.exit(2)
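As a reminder of the getopt conventions used here: a colon after a short option and an '=' after a long option both mean "takes an argument". A small self-contained demo with a hypothetical command line:

import getopt

argv = ['--tarball', '/tmp/arXiv-1002.0345.tar.gz', '-f']  # illustrative only
opts, args = getopt.getopt(argv, 'h:t:d:s:i:a:l:xfuyr:qck',
                           ['help', 'tarball=', 'force'])
print(opts)  # [('--tarball', '/tmp/arXiv-1002.0345.tar.gz'), ('-f', '')]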
def convert_images(image_list):
    '''
    Here we figure out the types of the images that were extracted from
    the tarball and determine how to convert them into PNG.

    @param: image_list ([string, string, ...]): the list of image files
        extracted from the tarball in step 1

    @return: image_list ([str, str, ...]): The list of image files when all
        have been converted to PNG format.
    '''

    png_output_contains = 'PNG image data'
    ps_output_contains = 'Postscript'
    eps_output_contains = 'PostScript'

    ret_list = []

    for image_file in image_list:
        if os.path.isdir(image_file):
            continue

        # FIXME: here and everywhere else in the plot extractor
        # library the run shell command statements should be (1)
        # called with timeout in order to prevent runaway imagemagick
        # conversions; (2) the arguments should be passed properly so
        # that they are escaped.

        dummy1, cmd_out, dummy2 = run_shell_command('file ' + image_file)
        if cmd_out.find(png_output_contains) > -1:
            ret_list.append(image_file)
        else:
            # we're just going to assume that ImageMagick can convert all
            # the image types that we may be faced with
            # for sure it can do EPS->PNG and JPG->PNG and PS->PNG
            # and PSTEX->PNG
            converted_image_file = get_converted_image_name(image_file)

            convert_cmd = 'convert '

            dummy1, cmd_out, cmd_err = run_shell_command(convert_cmd +\
                    image_file + ' ' + converted_image_file)
            if cmd_err == '':
                ret_list.append(converted_image_file)
            else:
                write_message('convert failed on ' + image_file)

    return ret_list
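The FIXME above can be addressed with the standard library alone: subprocess takes the arguments as a list (so nothing needs shell escaping) and enforces a hard timeout. A sketch, assuming ImageMagick's convert binary is on the PATH:

import subprocess

def convert_to_png(image_file, converted_image_file, timeout=30):
    """Sketch: one escaped, time-limited ImageMagick conversion."""
    try:
        result = subprocess.run(['convert', image_file, converted_image_file],
                                capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return False  # runaway conversion was killed
    return result.returncode == 0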
def extract_text(tarball):
    """
    We check to see if there's a file called tarball.pdf, and, if there is,
    we run pdftotext on it.  Simple as that.

    @param: tarball (string): the raw name of the tarball

    @return: None
    """
    try:
        os.stat(tarball + '.pdf')
        dummy1, dummy2, cmd_err = run_process_with_timeout('pdftotext %s %s' % \
                                     (tarball + '.pdf', tarball + '.txt'), shell = True)
        if cmd_err != '':
            return -1
        write_message('generated ' + tarball + '.txt from ' + tarball + '.pdf')
    except:
        write_message('no text from ' + tarball + '.pdf')
Example #12
def extract_text(tarball):
    """
    We check to see if there's a file called tarball.pdf, and, if there is,
    we run pdftotext on it.  Simple as that.

    @param: tarball (string): the raw name of the tarball

    @return: None
    """
    try:
        os.stat(tarball + '.pdf')
        cmd_list = ['pdftotext', tarball + '.pdf', tarball + '.txt']
        dummy1, dummy2, cmd_err = run_process_with_timeout(cmd_list)
        if cmd_err != '':
            return -1
        write_message('generated ' + tarball + '.txt from ' + tarball + '.pdf')
    except:
        write_message('no text from ' + tarball + '.pdf')
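Outside of Invenio's run_process_with_timeout helper, the same check-and-convert step can be sketched with os.path and subprocess, assuming pdftotext is installed:

import os
import subprocess

def extract_text_sketch(tarball, timeout=60):
    """Sketch: run pdftotext on tarball.pdf if that file exists."""
    pdf, txt = tarball + '.pdf', tarball + '.txt'
    if not os.path.exists(pdf):
        print('no text from ' + pdf)
        return
    try:
        subprocess.run(['pdftotext', pdf, txt], check=True, timeout=timeout)
        print('generated %s from %s' % (txt, pdf))
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
        print('no text from ' + pdf)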
def convert_images(image_list):
    """
    Here we figure out the types of the images that were extracted from
    the tarball and determine how to convert them into PNG.

    @param: image_list ([string, string, ...]): the list of image files
        extracted from the tarball in step 1

    @return: image_list ([str, str, ...]): The list of image files when all
        have been converted to PNG format.
    """
    png_output_contains = 'PNG image'
    ret_list = []
    for image_file in image_list:
        if os.path.isdir(image_file):
            continue

        # FIXME: here and everywhere else in the plot extractor
        # library the run shell command statements should be (1)
        # called with timeout in order to prevent runaway imagemagick
        # conversions; (2) the arguments should be passed properly so
        # that they are escaped.

        dummy1, cmd_out, dummy2 = run_shell_command('file %s', (image_file,))
        if cmd_out.find(png_output_contains) > -1:
            ret_list.append(image_file)
        else:
            # we're just going to assume that ImageMagick can convert all
            # the image types that we may be faced with
            # for sure it can do EPS->PNG and JPG->PNG and PS->PNG
            # and PSTEX->PNG
            converted_image_file = get_converted_image_name(image_file)
            try:
                dummy1, cmd_out, cmd_err = run_process_with_timeout('convert %s %s'\
                                                   % (image_file, \
                                                      converted_image_file), shell = True)
                if cmd_err == '':
                    ret_list.append(converted_image_file)
                else:
                    write_message('convert failed on ' + image_file)
            except Timeout:
                write_message('convert timed out on ' + image_file)

    return ret_list
def harvest_from_file(filename, to_dir):
    """
    Harvest from the file Tibor made.
    Format of a single entry:
        oai:arXiv.org:area/YYMMIII
            or
        oai:arXiv.org:YYMM.IIII
    """

    ok_format = '^oai:arXiv.org:(([a-zA-Z\\-]+/\\d+)|(\\d+\\.\\d+))$'

    try:
        names_file = open(filename)
        for arXiv_name in names_file.readlines():
            if re.match(ok_format, arXiv_name) is None:
                write_message('error on ' + arXiv_name + '. continuing.')
                continue
            harvest_single(arXiv_name, to_dir)

    except IOError:
        write_message('Something is wrong with the file!')
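The ok_format pattern accepts both the pre-2007 arXiv identifiers (area/YYMMIII) and the newer YYMM.IIII style. A quick self-check with illustrative identifiers:

import re

ok_format = '^oai:arXiv.org:(([a-zA-Z\\-]+/\\d+)|(\\d+\\.\\d+))$'
for name in ('oai:arXiv.org:hep-ph/0101001',  # old style: area/YYMMIII
             'oai:arXiv.org:1002.0345',       # new style: YYMM.IIII
             'oai:arXiv.org:not-an-id'):      # rejected
    print(name, bool(re.match(ok_format, name)))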
def parse_and_download(infile, sdir):
    """
    Read the information in the input file and download the corresponding
    tarballs from arXiv.

    @param: infile (string): the name of the file to parse
    @param: sdir (string): where to put the downloaded tarballs
    """

    tarfiles = []

    tardir = os.path.join(sdir, 'tarballs')
    if not os.path.isdir(tardir):
        try:
            os.makedirs(tardir)
        except OSError:
            write_message(sys.exc_info()[0])
            write_message('files will be loose, not in ' + tardir)
            tardir = sdir

    infile = open(infile)
    for line in infile.readlines():
        line = line.strip()
        if line.startswith('http://'):
            # hurray!
            try:
                url = line
                filename = url.split('/')[-1]
                filename = os.path.join(tardir, filename)
                urllib.urlretrieve(url, filename)
                tarfiles.append(filename)
                write_message('Downloaded to ' + filename)
                time.sleep(7) # be nice!
            except:
                write_message(filename + ' may already exist')
                write_message(sys.exc_info()[0])
        elif line.startswith('arXiv'):
            tarfiles.extend(tarballs_by_arXiv_id([line.strip()], sdir))

    return tarfiles
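For reference, the input file this function expects carries one entry per line, either a direct URL or an arXiv identifier. A hypothetical two-line example file:

http://export.arxiv.org/e-print/1002.0345
arXiv:1002.0345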
def make_single_directory(to_dir, dirname):
    """
        Makes a subdirectory for the arXiv record we are working with and
        returns its exact location.

        @param: to_dir (string): the name of the directory we want to make it
            in
        @param: dirname (string): the name of the directory we want to create

        @output: a new directory called dirname located in to_dir
        @return: the absolute path to the new directory
    """
    new_dir = os.path.join(to_dir, dirname)

    if not os.path.isdir(new_dir):
        try:
            os.mkdir(new_dir)
        except OSError:
            write_message('Failed to make new dir...')
            return to_dir

    return new_dir
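On Python 3 the isdir check and the mkdir call collapse into a single race-free makedirs; a sketch with the same fall-back contract:

import os

def make_single_directory_py3(to_dir, dirname):
    """Sketch: create to_dir/dirname if needed, else fall back to to_dir."""
    new_dir = os.path.join(to_dir, dirname)
    try:
        os.makedirs(new_dir, exist_ok=True)  # no isdir/mkdir race
    except OSError:
        print('Failed to make new dir...')
        return to_dir
    return new_dir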
def harvest_from_file(filename, to_dir):
    """
    Harvest from the file Tibor made.
    Format of a single entry:
        oai:arXiv.org:area/YYMMIII
            or
        oai:arXiv.org:YYMM.IIII
    """

    ok_format = '^oai:arXiv.org:(([a-zA-Z\\-]+/\\d+)|(\\d+\\.\\d+))$'

    try:
        names_file = open(filename)
        for arXiv_name in names_file.readlines():
            if re.match(ok_format, arXiv_name) is None:
                write_message('error on ' + arXiv_name + '. continuing.')
                continue
            harvest_single(arXiv_name, to_dir)
            time.sleep(CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT)

    except IOError:
        write_message('Something is wrong with the file!')
Example #21
def check_for_gzip(tfile):
    """
    Was that tarball also gzipped?  Let's find out!

    @param: file (string): the name of the object (so we can gunzip, if
        that's necessary)

    @output: a gunzipped file in the directory of choice, if that's necessary

    @return new_file (string): The name of the file after gunzipping or the
        original name of the file if that wasn't necessary
    """

    gzip_contains = 'gzip compressed data'
    dummy1, cmd_out, dummy2 = run_shell_command('file %s', (tfile, ))

    if cmd_out.find(gzip_contains) > -1:
        # we have a gzip!
        # gzip will not act on a file unless its name ends with .gz,
        # so make a suitably named copy first
        run_shell_command('cp %s %s' % (tfile, tfile + '.tar.gz'))
        new_dest = os.path.join(os.path.split(tfile)[0], 'tmp.tar')
        run_shell_command('touch %s' % (new_dest, ))
        dummy1, cmd_out, cmd_err = run_shell_command('gunzip -c %s' % \
                                                            (tfile + '.tar.gz',))
        if cmd_err != '':
            write_message('Error while gunzipping ' + tfile)
            return tfile

        tarfile = open(new_dest, 'w')
        tarfile.write(cmd_out)
        tarfile.close()

        run_shell_command('rm %s', (tfile + '.tar.gz', ))
        return new_dest

    return tfile
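The shell round-trip above (cp, gunzip -c, rm) can be avoided: gzip data starts with the two magic bytes 0x1f 0x8b, and the gzip and shutil modules decompress in-process. A sketch with the same return contract:

import gzip
import os
import shutil

def check_for_gzip_sketch(tfile):
    """Sketch: gunzip tfile into a sibling tmp.tar if it is gzip data."""
    with open(tfile, 'rb') as fd:
        if fd.read(2) != b'\x1f\x8b':  # not gzip, hand the file back
            return tfile
    new_dest = os.path.join(os.path.split(tfile)[0], 'tmp.tar')
    with gzip.open(tfile, 'rb') as src, open(new_dest, 'wb') as dst:
        shutil.copyfileobj(src, dst)
    return new_dest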
Example #23
def process_single(tarball, sdir=CFG_TMPDIR, xtract_text=False, \
                   upload_plots=True, force=False, squash=False):
    '''
    Processes one tarball end-to-end.

    @param: tarball (string): the absolute location of the tarball we wish
        to process
    @param: sdir (string): where we should put all the intermediate files for
        the processing.  if you're uploading, this directory should be one
        of the ones specified in CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS, else
        the upload won't work
    @param: xtract_text (boolean): true iff you want to run pdftotext on the
        pdf versions of the tarfiles.  this programme assumes that the pdfs
        are named the same as the tarballs but with a .pdf extension.
    @param: upload_plots (boolean): true iff you want to bibupload the plots
        extracted by this process

    @return: None
    '''

    sub_dir, refno = get_defaults(tarball, sdir)

    if not squash:
        marc_name = os.path.join(sub_dir, refno + '.xml')
    else:
        marc_name = os.path.join(sdir, SQUASHED_FILE)

    if (force or not os.path.exists(marc_name)) and not squash:
        open(marc_name, 'w').close()

    if xtract_text:
        extract_text(tarball)

    image_list, tex_files = untar(tarball, sub_dir)
    if not tex_files:
        write_message(os.path.split(tarball)[-1] + ' is not a tarball')
        run_shell_command('rm -r ' + sub_dir)
        return

    converted_image_list = convert_images(image_list)

    images_and_captions_and_labels = [['','', []]]
    for tex_file in tex_files:
        images_and_captions_and_labels.extend(extract_captions(tex_file,
                                               sub_dir, converted_image_list))

    marc_name = create_MARC(images_and_captions_and_labels, tex_files[0],
                            refno, converted_image_list, marc_name)
    if marc_name is not None and not squash:
        write_message('generated ' + marc_name)
        if upload_plots:
            upload_to_site(marc_name)

    clean_up(image_list)

    write_message('work complete on ' + os.path.split(tarball)[-1])
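A hypothetical end-to-end call for this variant, with an illustrative tarball path in the configured scratch area:

# extract text and build MARCXML, but skip the upload
process_single('/tmp/arXiv-1002.0345.tar.gz', sdir=CFG_TMPDIR,
               xtract_text=True, upload_plots=False)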
Example #24
def parse_and_download(infile, sdir):
    """
    Read the information in the input file and download the corresponding
    tarballs from arXiv.

    @param: infile (string): the name of the file to parse
    @param: sdir (string): where to put the downloaded tarballs
    """

    tarfiles = []

    tardir = os.path.join(sdir, 'tarballs')
    if not os.path.isdir(tardir):
        try:
            os.makedirs(tardir)
        except OSError:
            write_message(sys.exc_info()[0])
            write_message('files will be loose, not in ' + tardir)
            tardir = sdir

    infile = open(infile)
    for line in infile.readlines():
        line = line.strip()
        if line.startswith('http://'):
            # hurray!
            url = line
            filename = url.split('/')[-1]
            abs_path = os.path.join(tardir, filename)
            if not download_url(url=url,
                                content_type='tar',
                                download_to_file=abs_path):
                write_message(filename + ' may already exist')
                write_message(sys.exc_info()[0])
            filename = os.path.join(tardir, filename)
            tarfiles.append(filename)
            time.sleep(CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT) # be nice!
        elif line.startswith('arXiv'):
            tarfiles.extend(tarballs_by_arXiv_id([line.strip()], sdir))

    return tarfiles
Example #26
def parse_and_download(infile, sdir):
    '''
    Read the information in the input file and download the corresponding
    tarballs from arXiv.

    @param: infile (string): the name of the file to parse
    @param: sdir (string): where to put the downloaded tarballs
    '''

    tarfiles = []

    tardir = os.path.join(sdir, 'tarballs')
    if not os.path.isdir(tardir):
        try:
            os.makedirs(tardir)
        except OSError:
            write_message(sys.exc_info()[0])
            write_message('files will be loose, not in ' + tardir)
            tardir = sdir

    infile = open(infile)
    for line in infile.readlines():
        line = line.strip()
        if line.startswith('http://'):
            # hurray!
            try:
                url = line
                filename = url.split('/')[-1]
                filename = os.path.join(tardir, filename)
                urllib.urlretrieve(url, filename)
                tarfiles.append(filename)
            except:
                write_message(filename + ' may already exist')
                write_message(sys.exc_info()[0])

    return tarfiles
def harvest(to_dir, from_date, from_index):
    """
        Calls upon arXiv using URLS as described above in order to grab
        all the tarballs from HEP areas.

        @param: dir (string): the directory where everything that gets
            downloaded will sit
        @param: from_date (int): the date from which we would like to harvest,
            in YYMM format
        @param: from_index (int): the index where we want to begin our harvest
            in YYMM.  i.e. we want to start with the 345th record in 1002.

        @output: TONS OF .tar.gz FILES FROM ARXIV
        @return: (none)
    """

    global current_yearmonth

    if from_date > current_yearmonth and from_date < ARBITRARY_FROM_DATE:
        write_message('Please choose a from date that is not in the future!')
        sys.exit(1)
    if from_date % 100 > 12:
        write_message('Please choose a from date in the form YYMM')
        sys.exit(1)

    if from_date >= ARBITRARY_FROM_DATE or from_date < URL_MOVE:
        for area in HEP_AREAS:

            yearmonthindex = area[BEGIN_YEAR_MONTH_INDEX]

            # nasty casing!
            # I find this particularly horrid because we have to wrap dates..
            # i.e. although 9901 is more than 0001, we might want things in
            # 0001 and not from 9901
            if from_date < current_yearmonth:
                # we want to start in the new century; skip the while below
                yearmonthindex = CENTURY_END
            elif from_date < CENTURY_END:
                yearmonthindex = from_date

            # grab stuff from between 92 and 99
            old_URL_harvest(yearmonthindex, CENTURY_END, to_dir, area)

            yearmonthindex = CENTURY_BEGIN

            # more nasty casing
            if from_date < URL_MOVE:
                # that means we want to start sometime before the weird
                # url change
                yearmonthindex = from_date
            elif from_date > URL_MOVE and from_date < ARBITRARY_FROM_DATE:
                # we don't want to start yet
                yearmonthindex = URL_MOVE

            # grab stuff from between 00 and 07
            old_URL_harvest(yearmonthindex, URL_MOVE, to_dir, area)

    # also after the URL move, there was no distinction between
    # papers from different areas.  hence, outside the for loop

    # even more nasty casing!
    if from_date < current_yearmonth and from_date > URL_MOVE:
        # we want to start someplace after the URL move and before now
        yearmonthindex = from_date
    else:
        yearmonthindex = URL_MOVE

    # grab stuff from between 07 and today
    new_URL_harvest(yearmonthindex, from_index, to_dir)
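The date wrapping above is easier to follow if YYMM values are mapped onto one monotonic axis. A sketch of that reasoning, assuming the archive's two-digit years start in the 1990s:

def yymm_to_months(yymm, century_split=90):
    """Sketch: map a YYMM int to months since 1900 (YY >= 90 means 19YY)."""
    year, month = divmod(yymm, 100)
    year += 1900 if year >= century_split else 2000
    return year * 12 + month

# 9901 (Jan 1999) really does precede 0001 (Jan 2000):
assert yymm_to_months(9901) < yymm_to_months(1)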
Example #28
def extract_captions(tex_file, sdir, image_list, main=True):
    '''
    Take the TeX file and the list of images in the tarball (which all,
    presumably, are used in the TeX file) and figure out which captions
    in the text are associated with which images

    @param: tex_file (string): the name of the TeX file which mentions
        the images

    @return: images_and_captions_and_labels ([(string, string, string),
        (string, string, string), ...]):
        a list of tuples representing the names of images and their
        corresponding figure labels from the TeX file
    '''

    if os.path.isdir(tex_file):
        return []

    tex = open(tex_file)
    # possible figure lead-ins
    figure_head = '\\begin{figure'  # also matches figure*
    figure_tail = '\\end{figure'  # also matches figure*
    picture_head = '\\begin{picture}'
    displaymath_head = '\\begin{displaymath}'
    subfloat_head = '\\subfloat'
    subfig_head = '\\subfigure'
    includegraphics_head = '\\includegraphics'
    special_head = '\\special'
    epsfig_head = '\\epsfig'
    input_head = '\\input'
    # possible caption lead-ins
    caption_head = '\\caption'
    figcaption_head = '\\figcaption'

    label_head = '\\label'
    ref_head = '\\ref{'

    rotate = 'rotate='
    angle = 'angle='

    eps_tail = '.eps'
    ps_tail = '.ps'

    doc_head = '\\begin{document}'
    doc_tail = '\\end{document}'

    images_and_captions_and_labels = []
    cur_image = ''
    caption = ''
    label = []

    lines = tex.readlines()
    tex.close()

    # skip everything before the document head
    if main:
        for line_index in range(len(lines)):
            if lines[line_index].find(doc_head) < 0:
                lines[line_index] = ''
            else:
                break

    # are we using commas in filenames here?
    commas_okay = False
    for dirpath, dummy0, filenames in \
            os.walk(os.path.split(os.path.split(tex_file)[0])[0]):
        for filename in filenames:
            if filename.find(',') > -1:
                commas_okay = True
                break

    # a comment is a % not preceded by a \
    comment = re.compile("(?<!\\\\)%")

    for line_index in range(len(lines)):
        # get rid of pesky comments by splitting where the comment is
        # and keeping only the part before the %
        line = comment.split(lines[line_index])[0]
        line = line.strip()
        lines[line_index] = line

    in_figure_tag = 0

    for line_index in range(len(lines)):
        line = lines[line_index]

        if line == '':
            continue
        if line.find(doc_tail) > -1:
            return images_and_captions_and_labels

        '''
        FIGURE -
        structure of a figure:
        \begin{figure}
        \formatting...
        \includegraphics[someoptions]{FILENAME}
        \caption{CAPTION}  %caption and includegraphics may be switched!
        \end{figure}
        '''

        index = line.find(figure_head)
        if index > -1:
            in_figure_tag = 1
            # some authors put content outside the figure environment, so
            # flush anything already collected when a new figure opens
            cur_image, caption, label, images_and_captions_and_labels =\
                    put_it_together(cur_image, caption, label,\
                                    images_and_captions_and_labels,\
                                    line_index, lines, tex_file)

        # inclusion syntax varies widely, so check several possible
        # markers for image files

        index = max([line.find(eps_tail), line.find(ps_tail),\
                     line.find(epsfig_head)])
        if index > -1:
            if line.find(eps_tail) > -1 or line.find(ps_tail) > -1:
                ext = True
            else:
                ext = False
            filenames = intelligently_find_filenames(line, ext=ext,
                                                     commas_okay=commas_okay)

            # try to look ahead!  sometimes there are better matches after
            if line_index < len(lines) - 1:
                filenames.extend(\
                          intelligently_find_filenames(lines[line_index + 1],
                                                      commas_okay=commas_okay))
            if line_index < len(lines) - 2:
                filenames.extend(\
                          intelligently_find_filenames(lines[line_index + 2],
                                                      commas_okay=commas_okay))

            for filename in filenames:
                filename = str(filename)
                if cur_image == '':
                    cur_image = filename
                elif type(cur_image) == list:
                    if type(cur_image[SUB_CAPTION_OR_IMAGE]) == list:
                        cur_image[SUB_CAPTION_OR_IMAGE].append(filename)
                    else:
                        cur_image[SUB_CAPTION_OR_IMAGE] = [filename]
                else:
                    cur_image = ['', [cur_image, filename]]

        '''
        Rotate and angle
        '''
        index = max(line.find(rotate), line.find(angle))
        if index > -1:
            # which is the image associated to it?
            filenames = intelligently_find_filenames(line,
                                                     commas_okay=commas_okay)
            # try the line after and the line before
            filenames.extend(intelligently_find_filenames(lines[line_index+1],
                                                      commas_okay=commas_okay))
            filenames.extend(intelligently_find_filenames(lines[line_index-1],
                                                      commas_okay=commas_okay))

            already_tried = []
            for filename in filenames:
                if filename != 'ERROR' and filename not in already_tried:
                    if rotate_image(filename, line, sdir, image_list):
                        break
                    already_tried.append(filename)

        '''
        INCLUDEGRAPHICS -
        structure of includegraphics:
        \includegraphics[someoptions]{FILENAME}
        '''
        index = line.find(includegraphics_head)
        if index > -1:
            open_curly, open_curly_line, close_curly, dummy = \
                    find_open_and_close_braces(line_index, index, '{', lines)

            filename = lines[open_curly_line][open_curly+1:close_curly]

            if cur_image == '':
                cur_image = filename
            elif type(cur_image) == list:
                if type(cur_image[SUB_CAPTION_OR_IMAGE]) == list:
                    cur_image[SUB_CAPTION_OR_IMAGE].append(filename)
                else:
                    cur_image[SUB_CAPTION_OR_IMAGE] = [filename]
            else:
                cur_image = ['', [cur_image, filename]]

        '''
        {\input{FILENAME}}
        \caption{CAPTION}

        This input is ambiguous, since input is also used for things like
        inclusion of data from other LaTeX files directly.
        '''
        index = line.find(input_head)
        if index > -1:
            #write_message('found input tag')
            new_tex_names = intelligently_find_filenames(line, TeX=True,
                                                        commas_okay=commas_okay)

            for new_tex_name in new_tex_names:
                if new_tex_name != 'ERROR':
                    #write_message('input TeX: ' + new_tex_name)
                    new_tex_file = get_tex_location(new_tex_name, tex_file)
                    if new_tex_file is not None:
                        images_and_captions_and_labels.extend(extract_captions(\
                                                      new_tex_file, sdir,\
                                                      image_list,
                                                      main=False))

        '''PICTURE'''

        index = line.find(picture_head)
        if index > -1:
            # structure of a picture:
            # \begin{picture}
            # ....not worrying about this now
            write_message('found picture tag')

        '''DISPLAYMATH'''

        index = line.find(displaymath_head)
        if index > -1:
            # structure of a displaymath:
            # \begin{displaymath}
            # ....not worrying about this now
            write_message('found displaymath tag')

        '''
        LABELS -
        structure of a label:
        \label{somelabelnamewhichprobablyincludesacolon}

        Labels are used to tag images and will later be used in ref tags
        to reference them.  This is interesting because in effect the refs
        to a plot are additional caption for it.

        Notes: labels can be used for many more things than just plots.
        We'll have to experiment with how to best associate a label with an
        image.. if it's in the caption, it's easy.  If it's in a figure, it's
        still okay... but the images that aren't in figure tags are numerous.
        '''

        index = line.find(label_head)
        if index > -1:
            if in_figure_tag:
                # well then this clearly belongs to the current image
                open_curly, open_curly_line, close_curly, dummy = \
                    find_open_and_close_braces(line_index, index, '{', lines)
                cur_label = lines[open_curly_line][open_curly+1:close_curly]
                cur_label = cur_label.strip()
                label.append(cur_label)
                # write_message('found label ' + cur_label)

        '''
        CAPTIONS -
        structure of a caption:
        \caption[someoptions]{CAPTION}
        or
        \caption{CAPTION}
        or
        \caption{{options}{CAPTION}}
        '''

        index = max([line.find(caption_head), line.find(figcaption_head)])
        if index > -1:
            open_curly, open_curly_line, close_curly, close_curly_line = \
                    find_open_and_close_braces(line_index, index, '{', lines)

            cap_begin = open_curly + 1

            cur_caption = assemble_caption(open_curly_line, cap_begin, \
                        close_curly_line, close_curly, lines)

            if caption == '':
                caption = cur_caption
            elif type(caption) == list:
                if type(caption[SUB_CAPTION_OR_IMAGE]) == list:
                    caption[SUB_CAPTION_OR_IMAGE].append(cur_caption)
                else:
                    caption[SUB_CAPTION_OR_IMAGE] = [cur_caption]
            elif caption != cur_caption:
                caption = ['', [caption, cur_caption]]

        '''
        SUBFLOATS -
        structure of a subfloat (inside of a figure tag):
        \subfloat[CAPTION]{options{FILENAME}}

        also associated with the overall caption of the enclosing figure
        '''

        index = line.find(subfloat_head)
        if index > -1:
            # if we are dealing with subfloats, we need a different
            # sort of structure to keep track of captions and subcaptions
            if type(cur_image) != list:
                cur_image = [cur_image, []]
            if type(caption) != list:
                caption = [caption, []]

            open_square, open_square_line, close_square, close_square_line = \
                    find_open_and_close_braces(line_index, index, '[', lines)
            cap_begin = open_square + 1

            sub_caption = assemble_caption(open_square_line, \
                    cap_begin, close_square_line, close_square, lines)
            caption[SUB_CAPTION_OR_IMAGE].append(sub_caption)

            open_curly, open_curly_line, close_curly, dummy = \
                    find_open_and_close_braces(close_square_line, \
                    close_square, '{', lines)
            sub_image = lines[open_curly_line][open_curly+1:close_curly]

            cur_image[SUB_CAPTION_OR_IMAGE].append(sub_image)

        '''
        SUBFIGURES -
        structure of a subfigure (inside a figure tag):
        \subfigure[CAPTION]{
        \includegraphics[options]{FILENAME}}

        also associated with the overall caption of the enclosing figure
        '''

        index = line.find(subfig_head)
        if index > -1:
            # like with subfloats, we need a different structure for keepin
            # track of this stuff
            if type(cur_image) != list:
                cur_image = [cur_image, []]
            if type(caption) != list:
                caption = [caption, []]

            open_square, open_square_line, close_square, close_square_line = \
                    find_open_and_close_braces(line_index, index, '[', lines)
            cap_begin = open_square + 1

            sub_caption = assemble_caption(open_square_line, \
                    cap_begin, close_square_line, close_square, lines)
            caption[SUB_CAPTION_OR_IMAGE].append(sub_caption)

            index_cpy = index

            # find the graphics tag to get the filename
            # it is okay if we eat lines here
            index = line.find(includegraphics_head)
            while index == -1 and (line_index + 1) < len(lines):
                line_index = line_index + 1
                line = lines[line_index]
                index = line.find(includegraphics_head)

            if index == -1:
                line_index = index_cpy
                write_message('didn\'t find the image name on line ' +\
                                   str(line_index))

            else:
                open_curly, open_curly_line, close_curly, dummy = \
                        find_open_and_close_braces(line_index, \
                        index, '{', lines)
                sub_image = lines[open_curly_line][open_curly+1:close_curly]

                cur_image[SUB_CAPTION_OR_IMAGE].append(sub_image)

        '''
        FIGURE

        important: we put the check for the end of the figure at the end
        of the loop in case some pathological person puts everything in one
        line
        '''

        index = max([line.find(figure_tail), line.find(doc_tail)])
        if index > -1:
            in_figure_tag = 0

            cur_image, caption, label, images_and_captions_and_labels =\
                    put_it_together(cur_image, caption, label,\
                                    images_and_captions_and_labels,\
                                    line_index, lines, tex_file)

        '''
        END DOCUMENT

        we shouldn't look at anything after the end document tag is found
        '''

        index = line.find(doc_tail)
        if index > -1:
            break

    return images_and_captions_and_labels
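The comment regex used above deserves a sanity check: it must split on a bare % but leave an escaped \% alone. A minimal demo:

import re

comment = re.compile("(?<!\\\\)%")
line = "50\\% of events % trailing comment"
print(comment.split(line)[0])  # prints: 50\% of events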
Example #29
def process_single(tarball, sdir=CFG_TMPDIR, xtract_text=False, \
                   upload_plots=False, force=False, squash="", \
                   yes_i_know=False, refno_url="", \
                   clean=False):
    """
    Processes one tarball end-to-end.

    @param: tarball (string): the absolute location of the tarball we wish
        to process
    @param: sdir (string): where we should put all the intermediate files for
        the processing.  if you're uploading, this directory should be one
        of the ones specified in CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS, else
        the upload won't work
    @param: xtract_text (boolean): true iff you want to run pdftotext on the
        pdf versions of the tarfiles.  this programme assumes that the pdfs
        are named the same as the tarballs but with a .pdf extension.
    @param: upload_plots (boolean): true iff you want to bibupload the plots
        extracted by this process
    @param: force (boolean): force creation of new xml file
    @param: squash: write MARCXML output into a specified 'squash' file
        instead of single files.
    @param: yes_i_know: if True, no user interaction if upload_plots is True
    @param: refno_url: URL to the invenio-instance to query for refno.
    @param: clean: if True, everything except the original tarball, plots and
            context- files will be removed

    @return: marc_name(string): path to generated marcxml file
    """
    sub_dir, refno = get_defaults(tarball, sdir, refno_url)
    if not squash:
        marc_name = os.path.join(sub_dir, '%s.xml' % (refno,))
        if (force or not os.path.exists(marc_name)):
            marc_fd = open(marc_name, 'w')
            marc_fd.write('<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n')
            marc_fd.close()
    else:
        marc_name = squash
    if xtract_text:
        extract_text(tarball)
    try:
        extracted_files_list, image_list, tex_files = untar(tarball, sub_dir)
    except Timeout:
        write_message('Timeout during tarball extraction on %s' % (tarball,))
        return
    if not tex_files:
        write_message('%s is not a tarball' % (os.path.split(tarball)[-1],))
        run_shell_command('rm -r %s', (sub_dir,))
        return

    converted_image_list = convert_images(image_list)
    write_message('converted %d of %d images found for %s' % (len(converted_image_list), \
                                                              len(image_list), \
                                                              os.path.basename(tarball)))
    extracted_image_data = []

    for tex_file in tex_files:
        # Extract images, captions and labels
        partly_extracted_image_data = extract_captions(tex_file, sub_dir, \
                                                converted_image_list)
        if partly_extracted_image_data != []:
            # Add proper filepaths and do various cleaning
            cleaned_image_data = prepare_image_data(partly_extracted_image_data, \
                                                  tex_file, converted_image_list)
            # Using prev. extracted info, get contexts for each image found
            extracted_image_data.extend((extract_context(tex_file, cleaned_image_data)))
    extracted_image_data = remove_dups(extracted_image_data)
    if extracted_image_data == []:
        write_message('No plots detected in %s' % (refno,))
    else:
        if refno_url == "":
            refno = None
        create_contextfiles(extracted_image_data)
        marc_xml = create_MARC(extracted_image_data, tarball, refno)
        if not squash:
            marc_xml += "\n</collection>"
        if marc_name is not None:
            marc_fd = open(marc_name, 'a')
            marc_fd.write('%s\n' % (marc_xml,))
            marc_fd.close()
            if not squash:
                write_message('generated %s' % (marc_name,))
                if upload_plots:
                    upload_to_site(marc_name, yes_i_know)
    if clean:
        clean_up(extracted_files_list, image_list)
    write_message('work complete on %s' % (os.path.split(tarball)[-1],))
    return marc_name
Example #30
def usage():
    write_message(help_string)
Example #31
            clean = True
        elif opt in ['-l', refno_url_param]:
            refno_url = arg
        elif opt in ['-k', refno_param]:
            skip_refno = True
        else:
            usage()
            sys.exit()

    if sdir is None:
        sdir = CFG_TMPDIR
    elif not os.path.isdir(sdir):
        try:
            os.makedirs(sdir)
        except OSError:
            write_message('Error: We can\'t use this sdir.  using ' + \
                      'CFG_TMPDIR')
            sdir = CFG_TMPDIR

    if skip_refno:
        refno_url = ""

    tars_and_gzips = []

    if tarball is not None:
        tars_and_gzips.append(tarball)
    if tdir is not None:
        filetypes = ['gzip compressed', 'tar archive', 'Tar archive'] # FIXME
        write_message('Currently processing any tarballs in ' + tdir)
        tars_and_gzips.extend(get_list_of_all_matching_files(tdir, filetypes))
    if infile is not None:
        tars_and_gzips.extend(parse_and_download(infile, sdir))
Example #33
def put_it_together(cur_image, caption, label, images_and_captions_and_labels,
                    line_index, lines, tex_file):
    '''
    Takes the current image(s) and caption(s) and label(s) and assembles them
    into something useful in the images_and_captions_and_labels list.

    @param: cur_image (string || list): the image currently being dealt with, or
        the list of images, in the case of subimages
    @param: caption (string || list): the caption or captions currently in scope
    @param: label (list): the labels associated to this image/these images
    @param: images_and_captions_and_labels ([(string, string, list),
        (string, string, list), ...]): a list of tuples of images matched to
        captions and labels from this document.
    @param: line_index (int): the index where we are in the lines (for
        searchback and searchforward purposes)
    @param: lines ([string, string, ...]): the lines in the TeX
    @param: tex_file (string): the name of the TeX file we're dealing with

    @return: (cur_image, caption, images_and_captions_and_labels): the same
        arguments it was sent, processed appropriately
    '''

    if type(cur_image) == list:
        if cur_image[MAIN_CAPTION_OR_IMAGE] == 'ERROR':
            cur_image[MAIN_CAPTION_OR_IMAGE] = ''
        # rebuild the list rather than removing entries mid-iteration
        cur_image[SUB_CAPTION_OR_IMAGE] = \
                [image for image in cur_image[SUB_CAPTION_OR_IMAGE]
                 if image != 'ERROR']

    if cur_image != '' and caption != '':

        if type(cur_image) == list and type(caption) == list:

            if cur_image[MAIN_CAPTION_OR_IMAGE] != '' and\
                    caption[MAIN_CAPTION_OR_IMAGE] != '':
                images_and_captions_and_labels.append(
                    (cur_image[MAIN_CAPTION_OR_IMAGE],
                     caption[MAIN_CAPTION_OR_IMAGE], label))
            if type(cur_image[MAIN_CAPTION_OR_IMAGE]) == list:
                write_message('why is the main image a list?')
                # it's a good idea to attach the main caption to other
                # things, but the main image can only be used once
                cur_image[MAIN_CAPTION_OR_IMAGE] = ''

            if type(cur_image[SUB_CAPTION_OR_IMAGE]) == list:
                if type(caption[SUB_CAPTION_OR_IMAGE]) == list:
                    for index in \
                            range(len(cur_image[SUB_CAPTION_OR_IMAGE])):
                        if index < len(caption[SUB_CAPTION_OR_IMAGE]):
                            long_caption =\
                                caption[MAIN_CAPTION_OR_IMAGE] +' : '+\
                                caption[SUB_CAPTION_OR_IMAGE][index]
                        else:
                            long_caption =\
                                caption[MAIN_CAPTION_OR_IMAGE] +' : '+\
                                'caption not extracted'
                        images_and_captions_and_labels.append(
                            (cur_image[SUB_CAPTION_OR_IMAGE][index],
                             long_caption, label))

                else:
                    long_caption = caption[MAIN_CAPTION_OR_IMAGE] +\
                        ' : ' + caption[SUB_CAPTION_OR_IMAGE]
                    for sub_image in cur_image[SUB_CAPTION_OR_IMAGE]:
                        images_and_captions_and_labels.append(
                            (sub_image, long_caption, label))

            else:
                if type(caption[SUB_CAPTION_OR_IMAGE]) == list:
                    long_caption = caption[MAIN_CAPTION_OR_IMAGE]
                    for sub_cap in caption[SUB_CAPTION_OR_IMAGE]:
                        long_caption = long_caption + ' : ' + sub_cap
                    images_and_captions_and_labels.append(
                       (cur_image[SUB_CAPTION_OR_IMAGE], long_caption, label))
                else:
                    # neither sub-entry is a list here: pair them directly
                    images_and_captions_and_labels.append(
                        (cur_image[SUB_CAPTION_OR_IMAGE],
                         caption[SUB_CAPTION_OR_IMAGE], label))

        elif type(cur_image) == list:
            if cur_image[MAIN_CAPTION_OR_IMAGE] != '':
                images_and_captions_and_labels.append(
                    (cur_image[MAIN_CAPTION_OR_IMAGE], caption, label))
            if type(cur_image[SUB_CAPTION_OR_IMAGE]) == list:
                for image in cur_image[SUB_CAPTION_OR_IMAGE]:
                    images_and_captions_and_labels.append((image, caption,
                                                           label))
            else:
                images_and_captions_and_labels.append(
                    (cur_image[SUB_CAPTION_OR_IMAGE], caption, label))

        elif type(caption) == list:
            if caption[MAIN_CAPTION_OR_IMAGE] != '':
                images_and_captions_and_labels.append(
                    (cur_image, caption[MAIN_CAPTION_OR_IMAGE], label))
            if type(caption[SUB_CAPTION_OR_IMAGE]) == list:
                write_message('multiple caps for one image: ')
                long_caption = caption[MAIN_CAPTION_OR_IMAGE]
                for subcap in caption[SUB_CAPTION_OR_IMAGE]:
                    long_caption = long_caption + ' : ' + subcap
                write_message(long_caption)
                images_and_captions_and_labels.append((cur_image, long_caption,
                                                       label))
            else:
                images_and_captions_and_labels.append(
                    (cur_image, caption[SUB_CAPTION_OR_IMAGE], label))

        else:
            images_and_captions_and_labels.append((cur_image, caption, label))

    elif cur_image != '' and caption == '':
        # we may have missed the caption somewhere.
        REASONABLE_SEARCHBACK = 25
        REASONABLE_SEARCHFORWARD = 5
        curly_no_tag_preceding = '(?<!\\w){'

        for searchback in range(REASONABLE_SEARCHBACK):
            if line_index - searchback < 0:
                continue

            back_line = lines[line_index - searchback]
            m = re.search(curly_no_tag_preceding, back_line)
            if m is not None:
                open_curly = m.start()
                open_curly, open_curly_line, close_curly, \
                close_curly_line = find_open_and_close_braces(\
                line_index - searchback, open_curly, '{', lines)

                cap_begin = open_curly + 1

                caption = assemble_caption(open_curly_line, cap_begin, \
                    close_curly_line, close_curly, lines)

                if type(cur_image) == list:
                    images_and_captions_and_labels.append(
                        (cur_image[MAIN_CAPTION_OR_IMAGE], caption, label))
                    for sub_img in cur_image[SUB_CAPTION_OR_IMAGE]:
                        images_and_captions_and_labels.append(
                            (sub_img, caption, label))
                else:
                    images_and_captions_and_labels.append(
                        (cur_image, caption, label))
                # stop at the first caption found, list case included
                # (mirrors the searchforward loop below)
                break

        if caption == '':
            for searchforward in range(REASONABLE_SEARCHFORWARD):
                if line_index + searchforward >= len(lines):
                    break

                fwd_line = lines[line_index + searchforward]
                m = re.search(curly_no_tag_preceding, fwd_line)

                if m is not None:
                    open_curly = m.start()
                    open_curly, open_curly_line, close_curly, \
                    close_curly_line = find_open_and_close_braces(\
                    line_index + searchforward, open_curly, '{', lines)

                    cap_begin = open_curly + 1

                    caption = assemble_caption(open_curly_line, \
                              cap_begin, close_curly_line, close_curly, lines)

                    if type(cur_image) == list:
                        images_and_captions_and_labels.append(
                            (cur_image[MAIN_CAPTION_OR_IMAGE], caption, label))
                        for sub_img in cur_image[SUB_CAPTION_OR_IMAGE]:
                            images_and_captions_and_labels.append(
                                                (sub_img, caption, label))
                    else:
                        images_and_captions_and_labels.append((cur_image,
                                                           caption, label))
                    break

        if caption == '':
            if type(cur_image) == list:
                images_and_captions_and_labels.append(
                    (cur_image[MAIN_CAPTION_OR_IMAGE], 'No caption found',\
                     label))
                for sub_img in cur_image[SUB_CAPTION_OR_IMAGE]:
                    images_and_captions_and_labels.append(
                        (sub_img, 'No caption found', label))
            else:
                images_and_captions_and_labels.append(
                   (cur_image, 'No caption found', label))

    elif caption != '' and cur_image == '':
        if type(caption) == list:
            long_caption = caption[MAIN_CAPTION_OR_IMAGE]
            for subcap in caption[SUB_CAPTION_OR_IMAGE]:
                long_caption = long_caption + ' : ' + subcap
        else:
            long_caption = caption

        images_and_captions_and_labels.append(('', long_caption, label))

    # if we're leaving the figure, no sense keeping the data
    cur_image = ''
    caption = ''
    label = []

    return (cur_image, caption, label, images_and_captions_and_labels)
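# A hedged usage sketch: lines (the TeX source split into lines) and
# tex_file are assumed inputs, not defined in the source.  Whenever a
# figure environment closes, flush the collected image/caption/label
# state through put_it_together and start fresh for the next figure.
images_and_captions_and_labels = []
cur_image, caption, label = '', '', []
for line_index, line in enumerate(lines):
    if '\\end{figure}' in line:
        (cur_image, caption, label, images_and_captions_and_labels) = \
            put_it_together(cur_image, caption, label,
                            images_and_captions_and_labels,
                            line_index, lines, tex_file)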
Example #34
        elif opt in ["-y", "--" + yes_i_know_param]:
            yes_i_know = True
        elif opt in ["-c", "--" + clean_param]:
            clean = True
        elif opt in ["-l", "--" + refno_url_param]:
            refno_url = arg
        elif opt in ["-k", "--" + refno_param]:
            skip_refno = True
        else:
            usage()
            sys.exit()

    allowed_upload_modes = ("insert", "append", "correct", "replace")
    if upload_mode not in allowed_upload_modes:
        write_message(
            "Specified upload mode %s is not valid. Must be in %s" % (upload_mode, ", ".join(allowed_upload_modes))
        )
        usage()
        sys.exit()

    if sdir is None:
        sdir = CFG_TMPSHAREDDIR
    elif not os.path.isdir(sdir):
        try:
            os.makedirs(sdir)
        except OSError:
            write_message("Error: cannot use this sdir; using CFG_TMPSHAREDDIR")
            sdir = CFG_TMPSHAREDDIR

    if skip_refno:
        refno_url = ""
def harvest_single(single, to_dir):
    """
    If we only want to harvest one id (arXiv or DESY), we can use this.

    @param: single (string): an id from arXiv or DESY
    @param: to_dir (string): where the output should be saved

    @output: the PDF and source tarball (if applicable) of this single record

    @return: (tarball, pdf): the location of the source tarball and PDF, None
            if not found
    """

    if single.find("arXiv") > -1 and CFG_PLOTEXTRACTOR_ARXIV_BASE == "http://arxiv.org/":
        # good!
        id_str = re.findall("[a-zA-Z\\-]+/\\d+|\\d+\\.\\d+", single)[0]
        idno = id_str.split("/")
        if len(idno) > 0:
            idno = idno[-1]
        yymm = int(idno[:4])
        yymm_dir = make_useful_directories(yymm, to_dir)
        individual_dir = make_single_directory(yymm_dir, "arXiv:" + id_str)
        url_for_file = CFG_PLOTEXTRACTOR_ARXIV_BASE + CFG_PLOTEXTRACTOR_ARXIV_E_PRINT + id_str
        url_for_pdf = CFG_PLOTEXTRACTOR_ARXIV_BASE + CFG_PLOTEXTRACTOR_ARXIV_PDF + id_str
        individual_file = "arXiv:" + id_str.replace("/", "_")
        abs_path = os.path.join(individual_dir, individual_file)
        tarball = abs_path
        pdf = abs_path + ".pdf"
        write_message("download " + url_for_file + " to " + abs_path)
        if not download(url_for_file, individual_file, individual_dir):
            write_message("download of tarball failed")
            tarball = None
        if not download(url_for_pdf, individual_file + ".pdf", individual_dir):
            write_message("download of pdf failed")
            pdf = None
        return (tarball, pdf)

    elif single.find("arXiv") > -1 and CFG_PLOTEXTRACTOR_ARXIV_BASE != "":
        # hmm... is it a filesystem?
        if CFG_PLOTEXTRACTOR_ARXIV_BASE.startswith("/"):
            if not os.path.exists(CFG_PLOTEXTRACTOR_ARXIV_BASE):
                write_message("PROBLEM WITH CFG_PLOTEXTRACTOR_ARXIV_BASE: we cannot " + "find this folder!")
                return (None, None)
            for root, files, dummy in os.walk(CFG_PLOTEXTRACTOR_ARXIV_BASE):
                for file_name in files:
                    id_no = single.replace("arXiv", "")
                    if (
                        file_name.find(id_no) > -1
                        or file_name.find(id_no.replace("/", "_")) > -1
                        or file_name.find(id_no.replace("_", "/")) > -1
                        or file_name.find(id_no.replace(":", "")) > -1
                    ):
                        # that's our file!  probably.
                        return (os.path.join(root, file_name), None)

            # well, no luck there
            return (None, None)

        # okay... is it... a website?
        elif CFG_PLOTEXTRACTOR_ARXIV_BASE.startswith("http"):
            url_for_file = CFG_PLOTEXTRACTOR_ARXIV_BASE + single
            individual_file = os.path.join(to_dir, single)
            # download() joins to_dir itself, so pass the bare name
            download(url_for_file, single, to_dir)
            return (individual_file, None)

        # well, I don't know what to do with it
        else:
            write_message(
                "unsure how to handle CFG_PLOTEXTRACTOR_ARXIV_BASE. "
                + "please fix the harvest_single function in "
                + "miscutil/lib/plotextractor_getter.py"
            )
            return (None, None)

    elif single.find("DESY") > -1:
        # also okay!
        idno = re.findall("\\d{2,4}-\\d{3}", single)[0]
        year, number = idno.split("-")
        if len(year) < 4:
            if int(year) > 92:
                year = "19" + year
            else:
                year = "20" + year
        year_dir = make_single_directory(to_dir, year)
        desy_dir = make_single_directory(year_dir, "DESY")
        individual_dir = make_single_directory(desy_dir, number)
        id_no = year[2:] + "-" + number + ".pdf"
        url_for_file = CFG_PLOTEXTRACTOR_DESY_BASE + year + CFG_PLOTEXTRACTOR_DESY_PIECE + id_no
        individual_file = id_no
        write_message("download " + url_for_file + " to " + os.path.join(individual_dir, individual_file))
        download(url_for_file, individual_file, individual_dir)
        return (None, os.path.join(individual_dir, individual_file))
    write_message("END")
    return (None, None)
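# Hypothetical call (the identifier and directory are illustrative only):
# harvest one arXiv record and report what could not be fetched.
tarball, pdf = harvest_single('arXiv:1204.2147', '/tmp/plotextractor')
if tarball is None:
    write_message('no source tarball retrieved')
if pdf is None:
    write_message('no PDF retrieved')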
Example #36
def harvest_single(single, to_dir, selection=("tarball", "pdf")):
    """
    If we only want to harvest one id (arXiv or DESY), we can use this.

    @param: single (string): an id from arXiv or DESY
    @param: to_dir (string): where the output should be saved
    @param: selection (tuple): which artifacts to fetch, a subset of
        ("tarball", "pdf")

    @output: the PDF and source tarball (if applicable) of this single record

    @return: (tarball, pdf): the location of the source tarball and PDF, None
            if not found
    """

    if single.find('arXiv') > -1 and 'arxiv.org' in CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.lower():
        id_str = re.findall('[a-zA-Z\\-]+/\\d+|\\d+\\.\\d+', single)[0]
        idno = id_str.split('/')[-1]  # split() always returns >= 1 item
        yymm = int(idno[:4])
        yymm_dir = make_useful_directories(yymm, to_dir)
        url_for_file = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + \
                       CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER + \
                       id_str
        url_for_pdf = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + \
                      CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER + \
                      id_str + '.pdf' # adds '.pdf' to avoid arXiv internal redirect from arXivID to arXivID.pdf
        individual_file = 'arXiv:' + id_str.replace('/', '_')
        individual_dir = make_single_directory(yymm_dir, individual_file)
        abs_path = os.path.join(individual_dir, individual_file)
        tarball = abs_path
        pdf = abs_path + '.pdf'

        try:
            if "tarball" in selection:
                write_message('downloading ' + url_for_file + ' to ' + tarball)
                tarball = download_url(url=url_for_file,
                                       content_type='tar',
                                       download_to_file=tarball)
        except InvenioFileDownloadError:
            tarball = None

        try:
            if "pdf" in selection:
                write_message('downloading ' + url_for_pdf + ' to ' + pdf)
                pdf = download_url(url=url_for_pdf,
                                   content_type="pdf",
                                   download_to_file=pdf)
        except InvenioFileDownloadError:
            pdf = None

        return (tarball, pdf)

    elif single.find('arXiv') > -1 and CFG_PLOTEXTRACTOR_SOURCE_BASE_URL != '':
        # hmm... is it a filesystem?
        if CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.startswith('/'):
            if not os.path.exists(CFG_PLOTEXTRACTOR_SOURCE_BASE_URL):
                write_message('PROBLEM WITH CFG_PLOTEXTRACTOR_SOURCE_BASE_URL: we cannot ' + \
                        'find this folder!')
                return (None, None)
            # os.walk yields (dirpath, dirnames, filenames); we want filenames
            for root, dummy, files in os.walk(CFG_PLOTEXTRACTOR_SOURCE_BASE_URL):
                for file_name in files:
                    id_no = single.replace('arXiv', '')
                    if file_name.find(id_no) > -1 or\
                       file_name.find(id_no.replace('/', '_')) > -1 or\
                       file_name.find(id_no.replace('_', '/')) > -1 or\
                       file_name.find(id_no.replace(':', '')) > -1:
                        # that's our file!  probably.
                        return (os.path.join(root, file_name), None)

            # well, no luck there
            return (None, None)

        # okay... is it... a website?
        elif CFG_PLOTEXTRACTOR_SOURCE_BASE_URL.startswith('http') and "tarball" in selection:
            url_for_file = CFG_PLOTEXTRACTOR_SOURCE_BASE_URL + single
            individual_file = os.path.join(to_dir, single)
            # individual_file already includes to_dir; avoid joining it twice
            abs_path = individual_file
            try:
                abs_path = download_url(url=url_for_file,
                                        content_type='tar',
                                        download_to_file=abs_path)
            except InvenioFileDownloadError:
                abs_path = None
            return (abs_path, None)

        # well, I don't know what to do with it
        else:
            write_message('unsure how to handle CFG_PLOTEXTRACTOR_SOURCE_BASE_URL. ' + \
                  'please fix the harvest_single function in ' + \
                  'miscutil/lib/plotextractor_getter.py')
            return (None, None)

    elif single.find('DESY') > -1 and "pdf" in selection:
        # also okay!
        idno = re.findall('\\d{2,4}-\\d{3}', single)[0]
        year, number = idno.split('-')
        if len(year) < 4:
            if int(year) > 92:
                year = '19' + year
            else:
                year = '20' + year
        year_dir = make_single_directory(to_dir, year)
        desy_dir = make_single_directory(year_dir, 'DESY')
        individual_dir = make_single_directory(desy_dir, number)
        id_no = year[2:] + '-' + number + '.pdf'
        url_for_file = CFG_PLOTEXTRACTOR_DESY_BASE + year + \
                       CFG_PLOTEXTRACTOR_DESY_PIECE + id_no
        individual_file = id_no
        abs_path = os.path.join(individual_dir, individual_file)
        write_message('download ' + url_for_file + ' to ' + abs_path)
        try:
            abs_path = download_url(url=url_for_file,
                                    content_type='pdf',
                                    download_to_file=abs_path)
        except InvenioFileDownloadError:
            abs_path = None
        return (None, abs_path)
    write_message('END')
    return (None, None)
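# With this variant's selection parameter, only the requested artifacts are
# downloaded; a sketch (identifier and path are assumptions) fetching just
# the PDF and ignoring the tarball slot:
dummy, pdf = harvest_single('arXiv:1204.2147', '/tmp/plotextractor',
                            selection=('pdf',))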
def harvest_single(single, to_dir, selection=("tarball", "pdf")):
    """
    If we only want to harvest one id (arXiv or DESY), we can use this.

    @param: single (string): an id from arXiv or DESY
    @param: to_dir (string): where the output should be saved
    @param: selection (tuple): which artifacts to fetch, a subset of
        ("tarball", "pdf")

    @output: the PDF and source tarball (if applicable) of this single record

    @return: (tarball, pdf): the location of the source tarball and PDF, None
            if not found
    """

    if single.find('arXiv') > -1 and \
           CFG_PLOTEXTRACTOR_ARXIV_BASE == 'http://arxiv.org/':
        id_str = re.findall('[a-zA-Z\\-]+/\\d+|\\d+\\.\\d+', single)[0]
        idno = id_str.split('/')[-1]  # split() always returns >= 1 item
        yymm = int(idno[:4])
        yymm_dir = make_useful_directories(yymm, to_dir)
        individual_dir = make_single_directory(yymm_dir, 'arXiv:' + id_str)
        url_for_file = CFG_PLOTEXTRACTOR_ARXIV_BASE + CFG_PLOTEXTRACTOR_ARXIV_E_PRINT + \
                       id_str
        url_for_pdf = CFG_PLOTEXTRACTOR_ARXIV_BASE + CFG_PLOTEXTRACTOR_ARXIV_PDF + \
                      id_str
        individual_file = 'arXiv:' + id_str.replace('/', '_')
        abs_path = os.path.join(individual_dir, individual_file)
        tarball = abs_path
        pdf = abs_path + '.pdf'
        write_message('download ' + url_for_file + ' to ' + abs_path)
        if "tarball" in selection and not download(url_for_file, individual_file, individual_dir):
            write_message('download of tarball failed/skipped')
            tarball = None
        if "pdf" in selection and not download(url_for_pdf, individual_file + '.pdf', individual_dir):
            write_message('download of pdf failed/skipped')
            pdf = None
        return (tarball, pdf)

    elif single.find('arXiv') > -1 and CFG_PLOTEXTRACTOR_ARXIV_BASE != '':
        # hmm... is it a filesystem?
        if CFG_PLOTEXTRACTOR_ARXIV_BASE.startswith('/'):
            if not os.path.exists(CFG_PLOTEXTRACTOR_ARXIV_BASE):
                write_message('PROBLEM WITH CFG_PLOTEXTRACTOR_ARXIV_BASE: we cannot ' + \
                        'find this folder!')
                return (None, None)
            # os.walk yields (dirpath, dirnames, filenames); we want filenames
            for root, dummy, files in os.walk(CFG_PLOTEXTRACTOR_ARXIV_BASE):
                for file_name in files:
                    id_no = single.replace('arXiv', '')
                    if file_name.find(id_no) > -1 or\
                       file_name.find(id_no.replace('/', '_')) > -1 or\
                       file_name.find(id_no.replace('_', '/')) > -1 or\
                       file_name.find(id_no.replace(':', '')) > -1:
                        # that's our file!  probably.
                        return (os.path.join(root, file_name), None)

            # well, no luck there
            return (None, None)

        # okay... is it... a website?
        elif CFG_PLOTEXTRACTOR_ARXIV_BASE.startswith('http') and "tarball" in selection:
            url_for_file = CFG_PLOTEXTRACTOR_ARXIV_BASE + single
            individual_file = os.path.join(to_dir, single)
            # download() joins to_dir itself, so pass the bare name
            download(url_for_file, single, to_dir)
            return (individual_file, None)

        # well, I don't know what to do with it
        else:
            write_message('unsure how to handle CFG_PLOTEXTRACTOR_ARXIV_BASE. ' + \
                  'please fix the harvest_single function in ' + \
                  'miscutil/lib/plotextractor_getter.py')
            return (None, None)

    elif single.find('DESY') > -1 and "pdf" in selection:
        # also okay!
        idno = re.findall('\\d{2,4}-\\d{3}', single)[0]
        year, number = idno.split('-')
        if len(year) < 4:
            if int(year) > 92:
                year = '19' + year
            else:
                year = '20' + year
        year_dir = make_single_directory(to_dir, year)
        desy_dir = make_single_directory(year_dir, 'DESY')
        individual_dir = make_single_directory(desy_dir, number)
        id_no = year[2:] + '-' + number + '.pdf'
        url_for_file = CFG_PLOTEXTRACTOR_DESY_BASE + year + \
                       CFG_PLOTEXTRACTOR_DESY_PIECE + id_no
        individual_file = id_no
        write_message('download ' + url_for_file + ' to ' + \
                os.path.join(individual_dir, individual_file))
        download(url_for_file, individual_file, individual_dir)
        return (None, os.path.join(individual_dir, individual_file))
    write_message('END')
    return (None, None)
def untar(tarball, sdir):
    '''
    Decide whether our file is actually a tarball (sometimes the 'tarballs'
    fetched from arXiv are not: if they contain only the TeX source, they
    are just that single file).  If it is one, untar it and sort its
    contents into candidate TeX files and images.

    @param: tarball (string): the name of the tar file from arXiv
    @param: sdir (string): the directory where we would like it untarred

    @return: (image_list, might_be_tex) (([string, ...], [string, ...])):
        the list of images in the tarball and the list of candidate TeX
        files found in the tarball (empty or None on failure)
    '''

    tarball = check_for_gzip(tarball, sdir)
    dummy1, cmd_out, cmd_err = run_shell_command('file ' + tarball)

    tarball_output = 'tar archive'
    if re.search(tarball_output, cmd_out) is None:
        run_shell_command('rm ' + tarball)
        return ([], None)

    cmd = 'tar xvf ' + tarball + ' -C ' + sdir
    dummy1, cmd_out, cmd_err = run_shell_command(cmd)

    if cmd_err != '':
        return ([], None)

    run_shell_command('rm ' + tarball)
    cmd_out = cmd_out.split('\n')

    tex_output_contains = 'TeX'

    tex_file_extension = 'tex'
    image_output_contains = 'image'
    eps_output_contains = '- type eps'
    ps_output_contains = 'Postscript'

    image_list = []
    might_be_tex = []

    for extracted_file in cmd_out:
        if extracted_file == '':
            break

        # ensure we are actually looking at the right file
        extracted_file = os.path.join(sdir, extracted_file)

        # bind a fresh name here: reassigning cmd_out would shadow the list
        # this loop iterates over
        dummy1, file_output, dummy2 = run_shell_command('file ' + extracted_file)

        # is it TeX?
        if file_output.find(tex_output_contains) > -1:
            might_be_tex.append(extracted_file)

        # is it an image?
        elif file_output.lower().find(image_output_contains) > file_output.find(':') \
                or \
                file_output.lower().find(eps_output_contains) > file_output.find(':')\
                or \
                file_output.find(ps_output_contains) > file_output.find(':'):
            # we have "image" in the output, and it is not in the filename
            # i.e. filename.ext: blah blah image blah blah
            image_list.append(extracted_file)

        # if neither, maybe it is TeX or an image anyway, otherwise,
        # we don't care
        else:
            if extracted_file.split('.')[-1] == tex_file_extension:
                # we might have tex source!
                might_be_tex.append(extracted_file)
            else:
                if extracted_file.split('.')[-1] in ['eps', 'png',\
                    'ps', 'jpg']:
                    # we might have an image!
                    image_list.append(extracted_file)
                #else:
                    # we don't care about it
                    #run_shell_command('rm ' + extracted_file)

    if not might_be_tex:
        # well, that's tragic
        write_message('could not find TeX file in tar archive')
        return ([], [])

    return (image_list, might_be_tex)
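# Sketch of the typical pipeline (identifier and directories are
# assumptions): harvest the source tarball for one record, then unpack it
# and split the contents into candidate TeX files and images.
tarball, dummy = harvest_single('arXiv:1204.2147', '/tmp/plotextractor',
                                selection=('tarball',))
if tarball is not None:
    image_list, might_be_tex = untar(tarball, '/tmp/plotextractor/scratch')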
Example #40
        elif opt in ['-q', squash_param]:
            squash = True
        elif opt in ['-y', yes_i_know_param]:
            yes_i_know = True
        else:
            usage()
            sys.exit()

    if sdir is None:
        sdir = CFG_TMPDIR

    if not os.path.isdir(sdir):
        try:
            os.makedirs(sdir)
        except OSError:
            write_message('we can\'t use this sdir; using CFG_TMPDIR')
            sdir = CFG_TMPDIR

    tars_and_gzips = []

    if tarball is not None:
        tars_and_gzips.append(tarball)
    if tdir is not None:
        filetypes = ['gzip compressed', 'ar archive'] # that catches [t,T]ar # FIXME
        write_message('processing any tarballs in ' + tdir)
        tars_and_gzips.extend(get_list_of_all_matching_files(tdir, filetypes))
    if infile is not None:
        tars_and_gzips.extend(parse_and_download(infile, sdir))
    if recids is not None:
        tars_and_gzips.extend(tarballs_by_recids(recids, sdir))
    if arXiv is not None: