Ejemplo n.º 1
0
def rotate_image(filename, line, sdir, image_list):
    """
    Rotate an image file according to the rotation requested in a TeX line.

    The first ``angle=N`` or ``rotate=N`` option found in the line is read,
    negated, and handed to ``mogrify -rotate``, which is run on the
    located image file.

    @param: filename (string): the name of the file as specified in the TeX
    @param: line (string): the line where the rotate command was found
    @param: sdir (string): forwarded to get_image_location, together with
        image_list, to resolve filename to a location on disk
    @param: image_list ([string, string, ...]): forwarded to
        get_image_location

    @output: the image file rotated in accordance with the rotate command
    @return: True if the image was rotated; False if the line holds no
        usable rotation, the image could not be located, or mogrify wrote
        anything to stderr
    """
    matches = re.findall(r'(angle=[-\d]+|rotate=[-\d]+)', line)
    if not matches:
        return False

    # The pattern above also accepts strings such as '--5' or '1-2'; let
    # int() be the judge so that malformed angles are rejected instead of
    # raising ValueError.
    try:
        requested = int(matches[0].split('=')[-1].strip())
    except ValueError:
        return False

    # Only look the file up once we know there is a rotation to apply.
    file_loc = get_image_location(filename, sdir, image_list)
    if file_loc is None or file_loc == 'ERROR':
        return False

    # The requested angle is negated before being passed to mogrify.
    degrees = str(-requested)
    cmd_list = ['mogrify', '-rotate', degrees, file_loc]
    dummy_status, dummy_out, cmd_err = run_process_with_timeout(cmd_list)

    # Output on stderr is treated as a failed rotation, as the other
    # helpers in this file do for their external commands.
    return cmd_err == ''
Ejemplo n.º 2
0
 def test_run_cmd_timeout_big_stdout(self):
     """shellutils - running simple command with a big standard output

     Runs ``cat`` on a source file from the Invenio library directory and
     checks that the captured stdout equals the file content and that the
     exit status is 0.
     """
     from invenio.config import CFG_PYLIBDIR
     test_file = os.path.join(CFG_PYLIBDIR, 'invenio', 'bibcirculation_templates.py')
     exitstatus, stdout, stderr = run_process_with_timeout(['cat', test_file], timeout=10)
     # Read the reference content through a context manager so the file
     # handle is closed right away instead of being left to the garbage
     # collector.
     with open(test_file) as reference:
         expected = reference.read()
     self.assertEqual(expected, stdout)
     self.assertEqual(exitstatus, 0)
Ejemplo n.º 3
0
 def test_run_cmd_timeout_no_timeout(self):
     """shellutils - running simple command without expiring timeout

     Runs the helper script with a timeout of 10 and checks that 'foo'
     shows up on stdout, 'bar' on stderr, and that the exit status is 0.
     """
     exitstatus, stdout, stderr = run_process_with_timeout(
         [self.script_path, '5'], timeout=10)
     # assertTrue replaces the deprecated failUnless alias; the messages
     # show the captured streams when a check fails.
     self.assertTrue(
         'foo' in stdout,
         "'foo' not found in stdout: %r" % (stdout,))
     self.assertTrue(
         'bar' in stderr,
         "'bar' not found in stderr: %r" % (stderr,))
     self.assertEqual(exitstatus, 0)
def execute_command_with_stderr(*args, **argd):
    """
    Run a command through run_process_with_timeout and return its output.

    The positional arguments form the command. The optional 'cwd' and
    'filename_out' keyword arguments are forwarded to
    run_process_with_timeout (as None when absent).

    @return: (stdout, stderr) as reported by run_process_with_timeout
    @raise InvenioWebSubmitFileConverterError: if the reported status is
        not 0
    """
    debug("Executing: %s" % (args, ))
    working_dir = argd.get('cwd')
    output_file = argd.get('filename_out')
    status, out, err = run_process_with_timeout(
        args, cwd=working_dir, filename_out=output_file)
    if status == 0:
        return out, err
    # Non-zero status: log it, then surface both streams in the exception.
    error("Error when executing %s" % (args, ))
    raise InvenioWebSubmitFileConverterError(
        "Error in running %s\n stdout:\n%s\nstderr:\n%s\n" % (args, out, err))
def execute_command_with_stderr(*args, **argd):
    """
    Execute the command given as positional arguments and collect its output.

    Delegates to run_process_with_timeout, forwarding the 'cwd' and
    'filename_out' keyword arguments when present (None otherwise).

    @return: the (stdout, stderr) pair reported by run_process_with_timeout
    @raise InvenioWebSubmitFileConverterError: when the reported status
        differs from 0
    """
    debug("Executing: %s" % (args, ))
    outcome = run_process_with_timeout(
        args,
        cwd=argd.get('cwd'),
        filename_out=argd.get('filename_out'))
    exit_code, captured_out, captured_err = outcome
    failed = exit_code != 0
    if failed:
        error("Error when executing %s" % (args, ))
        details = (args, captured_out, captured_err)
        raise InvenioWebSubmitFileConverterError(
            "Error in running %s\n stdout:\n%s\nstderr:\n%s\n" % details)
    return captured_out, captured_err
Ejemplo n.º 6
0
 def test_run_cmd_timeout_pgid(self):
     """shellutils - running simple command should have PID == PGID

     Runs the Python helper script and checks its stdout for the
     'PID == PGID' marker and for the absence of 'PID != PGID'.
     """
     exitstatus, stdout, stderr = run_process_with_timeout(
         [self.python_script_path, '5'])
     # assertFalse/assertTrue replace the deprecated failIf/failUnless
     # aliases, which newer unittest releases no longer provide.
     self.assertFalse(
         'PID != PGID' in stdout,
         'PID != PGID was found in current output: %s (%s)' %
         (stdout, stderr))
     self.assertTrue(
         'PID == PGID' in stdout,
         'PID == PGID wasn\'t found in current output: %s (%s)' %
         (stdout, stderr))
Ejemplo n.º 7
0
def extract_text(tarball):
    """
    Run pdftotext on <tarball>.pdf, if that file exists, writing the
    extracted text to <tarball>.txt.

    @param: tarball (string): the raw name of the tarball

    @return: None in the normal case, including when the PDF is missing
        or the command could not be run (a message is written instead);
        -1 if pdftotext wrote anything to stderr
    """
    pdf_file = tarball + '.pdf'
    txt_file = tarball + '.txt'
    try:
        # os.stat raises if the PDF is not there, which lands in the
        # "no text" branch below.
        os.stat(pdf_file)
        # Each list item reaches pdftotext verbatim, so the input name
        # must not carry a trailing space.
        cmd_list = ['pdftotext', pdf_file, txt_file]
        dummy1, dummy2, cmd_err = run_process_with_timeout(cmd_list)
    except Exception:
        # Best effort: a missing PDF or a failure to run the command is
        # reported, not raised.  Unlike a bare except, this lets
        # KeyboardInterrupt and SystemExit through.
        write_message('no text from ' + pdf_file)
        return None

    if cmd_err != '':
        return -1
    write_message('generated ' + txt_file + ' from ' + pdf_file)
    return None
Ejemplo n.º 8
0
def extract_text(tarball):
    """
    Run pdftotext on <tarball>.pdf, if that file exists, writing the
    extracted text to <tarball>.txt.

    @param: tarball (string): the raw name of the tarball

    @return: None in the normal case, including when the PDF is missing
        or the command could not be run (a message is written instead);
        -1 if pdftotext wrote anything to stderr
    """
    pdf_file = tarball + '.pdf'
    txt_file = tarball + '.txt'
    try:
        # os.stat raises if the PDF is not there, which lands in the
        # "no text" branch below.
        os.stat(pdf_file)
        # Pass the arguments as a list rather than a shell string: paths
        # containing spaces or shell metacharacters are then handed to
        # pdftotext untouched instead of being interpreted by a shell.
        cmd_list = ['pdftotext', pdf_file, txt_file]
        dummy1, dummy2, cmd_err = run_process_with_timeout(cmd_list)
    except Exception:
        # Best effort: a missing PDF or a failure to run the command is
        # reported, not raised.  Unlike a bare except, this lets
        # KeyboardInterrupt and SystemExit through.
        write_message('no text from ' + pdf_file)
        return None

    if cmd_err != '':
        return -1
    write_message('generated ' + txt_file + ' from ' + pdf_file)
    return None
def convert_images(image_list):
    """
    Here we figure out the types of the images that were extracted from
    the tarball and convert those that are not yet PNG into PNG.

    Directories in the list are skipped.  Files that file(1) already
    reports as PNG are kept as they are; every other file is handed to
    ImageMagick's ``convert``.

    @param: image_list ([string, string, ...]): the list of image files
        extracted from the tarball in step 1

    @return: image_list ([str, str, ...]): the PNG files: inputs that
        were already PNG plus the converted names of the others.  Inputs
        whose conversion wrote to stderr or timed out are left out (a
        message is written for each).
    """
    png_output_contains = 'PNG image'
    ret_list = []
    for image_file in image_list:
        if os.path.isdir(image_file):
            continue

        # FIXME: here and everywhere else in the plot extractor library
        # the run shell command statements should be called with timeout
        # in order to prevent runaway imagemagick conversions.
        dummy1, cmd_out, dummy2 = run_shell_command('file %s', (image_file,))
        if png_output_contains in cmd_out:
            ret_list.append(image_file)
            continue

        # we're just going to assume that ImageMagick can convert all
        # the image types that we may be faced with
        # for sure it can do EPS->PNG and JPG->PNG and PS->PNG
        # and PSTEX->PNG
        converted_image_file = get_converted_image_name(image_file)
        try:
            # Arguments go in as a list, not a shell string, so file
            # names with spaces or shell metacharacters reach convert
            # untouched instead of being interpreted by a shell.
            dummy1, cmd_out, cmd_err = run_process_with_timeout(
                ['convert', image_file, converted_image_file])
        except Timeout:
            write_message('convert timed out on ' + image_file)
            continue

        if cmd_err == '':
            ret_list.append(converted_image_file)
        else:
            write_message('convert failed on ' + image_file)

    return ret_list
Ejemplo n.º 10
0
 def test_run_cmd_viasudo_no_password(self):
     """shellutils - running simple command via sudo should not wait for password

     Runs the helper script with sudo='foo' and a timeout of 10 and
     checks that the reported exit status is not 0.
     """
     command = [self.script_path, '5']
     exitstatus, dummy_out, dummy_err = run_process_with_timeout(
         command, timeout=10, sudo='foo')
     self.assertNotEqual(exitstatus, 0)
Ejemplo n.º 11
0
 def test_run_cmd_timeout_pgid(self):
     """shellutils - running simple command should have PID == PGID

     Runs the Python helper script and checks its stdout for the
     'PID == PGID' marker and for the absence of 'PID != PGID'.
     """
     exitstatus, stdout, stderr = run_process_with_timeout([self.python_script_path, '5'])
     # assertFalse/assertTrue replace the deprecated failIf/failUnless
     # aliases, which newer unittest releases no longer provide.
     self.assertFalse('PID != PGID' in stdout, 'PID != PGID was found in current output: %s (%s)' % (stdout, stderr))
     self.assertTrue('PID == PGID' in stdout, 'PID == PGID wasn\'t found in current output: %s (%s)' % (stdout, stderr))
Ejemplo n.º 12
0
 def test_run_cmd_timeout_no_timeout(self):
     """shellutils - running simple command without expiring timeout

     Runs the helper script with a timeout of 10 and checks that 'foo'
     shows up on stdout, 'bar' on stderr, and that the exit status is 0.
     """
     exitstatus, stdout, stderr = run_process_with_timeout([self.script_path, '5'], timeout=10)
     # assertTrue replaces the deprecated failUnless alias; the messages
     # show the captured streams when a check fails.
     self.assertTrue('foo' in stdout, "'foo' not found in stdout: %r" % (stdout,))
     self.assertTrue('bar' in stderr, "'bar' not found in stderr: %r" % (stderr,))
     self.assertEqual(exitstatus, 0)
Ejemplo n.º 13
0
def untar(original_tarball, sdir):
    """
    Unpack a tarball into a directory and classify what it contained.

    Sometimes the 'tarballs' gotten from arXiv aren't actually tarballs:
    if they 'contain' only the TeX file, then they are just that file.
    The file type is therefore checked with file(1) before tar is run.
    Every extracted path is then classified as possible TeX source or as
    an image, first from its file(1) description and, failing that, from
    its extension.

    @param: original_tarball (string): the name of the tar file from arXiv
    @param: sdir (string): the directory where we would like it untarred to

    @return: (file_list, image_list, might_be_tex)
        (([string, ...], [string, ...], [string, ...])): all extracted
        paths, the ones that look like images, and the ones that might be
        TeX source.  ([], [], None) is returned when the file is not a
        tar archive or tar wrote to stderr; ([], [], []) when no TeX
        candidate was found.
    """
    # presumably yields a decompressed copy when the input is gzipped and
    # the input itself otherwise -- TODO confirm against check_for_gzip
    tarball = check_for_gzip(original_tarball)

    dummy1, type_out, dummy2 = run_shell_command('file %s', (tarball, ))
    if 'tar archive' not in type_out:
        # NOTE: this removes the file even when it is the original input.
        run_shell_command('rm %s', (tarball, ))
        return ([], [], None)

    cmd_list = ['tar', 'xvf', tarball, '-C', sdir]
    dummy1, tar_out, tar_err = run_process_with_timeout(cmd_list)
    if tar_err != '':
        return ([], [], None)
    if original_tarball != tarball:
        run_shell_command('rm %s', (tarball, ))

    tex_output_contains = 'TeX'
    tex_file_extension = 'tex'
    image_output_contains = 'image'
    eps_output_contains = '- type eps'
    # NOTE(review): matched case-sensitively below; confirm this is the
    # spelling file(1) actually prints.
    ps_output_contains = 'Postscript'
    image_file_extensions = ('eps', 'png', 'ps', 'jpg', 'pdf')

    file_list = []
    image_list = []
    might_be_tex = []

    # "tar xvf" prints one extracted path per line.
    for extracted_file in tar_out.split('\n'):
        if extracted_file == '':
            # the split leaves an empty string after the final newline
            break
        if extracted_file.startswith('./'):
            extracted_file = extracted_file[2:]
        # ensure we are actually looking at the right file
        extracted_file = os.path.join(sdir, extracted_file)

        # Add to full list of extracted files
        file_list.append(extracted_file)

        # A separate name is used for this output so the tar listing
        # being iterated is never rebound.
        dummy1, file_out, dummy2 = run_shell_command('file %s',
                                                     (extracted_file, ))
        file_out_lower = file_out.lower()
        # The keywords only count when they come after the
        # "filename.ext:" prefix, i.e. when they are not part of the name.
        colon_at = file_out.find(':')
        # Lower-cased so that FIG.EPS or PAPER.TEX are recognised too.
        extension = extracted_file.split('.')[-1].lower()

        # is it TeX?
        if file_out.find(tex_output_contains) > -1:
            might_be_tex.append(extracted_file)

        # is it an image?
        elif file_out_lower.find(image_output_contains) > colon_at \
                or file_out_lower.find(eps_output_contains) > colon_at \
                or file_out.find(ps_output_contains) > colon_at:
            image_list.append(extracted_file)

        # if neither, maybe it is TeX or an image anyway, otherwise,
        # we don't care
        elif extension == tex_file_extension:
            # we might have tex source!
            might_be_tex.append(extracted_file)
        elif extension in image_file_extensions:
            # we might have an image!
            image_list.append(extracted_file)

    if not might_be_tex:
        # well, that's tragic
        # could not find TeX file in tar archive
        return ([], [], [])

    return (file_list, image_list, might_be_tex)
Ejemplo n.º 14
0
 def test_run_cmd_no_timeout(self):
     """shellutils - running simple command with non expiring timeout

     Runs the helper script with a timeout of 15 and checks that its
     stdout splits into three pieces on newlines and that the call
     returned in under 7 seconds.
     """
     t1 = time.time()
     stdout = run_process_with_timeout((self.script_path, '5'), timeout=15)[1]
     self.assertEqual(3, len(stdout.split('\n')))
     elapsed = time.time() - t1
     # assertTrue replaces the deprecated failUnless alias.
     self.assertTrue(elapsed < 7, 'command took %.1f seconds' % elapsed)
Ejemplo n.º 15
0
def untar(original_tarball, sdir):
    """
    Unpack a tarball into a directory and classify what it contained.

    Sometimes the 'tarballs' gotten from arXiv aren't actually tarballs:
    if they 'contain' only the TeX file, then they are just that file.
    The file type is therefore checked with file(1) before tar is run.
    Every extracted path is then classified as possible TeX source or as
    an image, first from its file(1) description and, failing that, from
    its extension.

    @param: original_tarball (string): the name of the tar file from arXiv
    @param: sdir (string): the directory where we would like it untarred to

    @return: (file_list, image_list, might_be_tex)
        (([string, ...], [string, ...], [string, ...])): all extracted
        paths, the ones that look like images, and the ones that might be
        TeX source.  ([], [], None) is returned when the file is not a
        tar archive or tar wrote to stderr; ([], [], []) when no TeX
        candidate was found.
    """
    # presumably yields a decompressed copy when the input is gzipped and
    # the input itself otherwise -- TODO confirm against check_for_gzip
    tarball = check_for_gzip(original_tarball)

    dummy1, type_out, dummy2 = run_shell_command('file %s', (tarball,))
    if 'tar archive' not in type_out:
        # NOTE: this removes the file even when it is the original input.
        run_shell_command('rm %s', (tarball,))
        return ([], [], None)

    # Arguments go in as a list, not a shell string, so paths with spaces
    # or shell metacharacters reach tar untouched instead of being
    # interpreted by a shell.
    cmd_list = ['tar', 'xvf', tarball, '-C', sdir]
    dummy1, tar_out, tar_err = run_process_with_timeout(cmd_list)
    if tar_err != '':
        return ([], [], None)
    if original_tarball != tarball:
        run_shell_command('rm %s', (tarball,))

    tex_output_contains = 'TeX'
    tex_file_extension = 'tex'
    image_output_contains = 'image'
    eps_output_contains = '- type eps'
    # NOTE(review): matched case-sensitively below; confirm this is the
    # spelling file(1) actually prints.
    ps_output_contains = 'Postscript'
    image_file_extensions = ('eps', 'png', 'ps', 'jpg', 'pdf')

    file_list = []
    image_list = []
    might_be_tex = []

    # "tar xvf" prints one extracted path per line.
    for extracted_file in tar_out.split('\n'):
        if extracted_file == '':
            # the split leaves an empty string after the final newline
            break
        if extracted_file.startswith('./'):
            extracted_file = extracted_file[2:]
        # ensure we are actually looking at the right file
        extracted_file = os.path.join(sdir, extracted_file)

        # Add to full list of extracted files
        file_list.append(extracted_file)

        # A separate name is used for this output so the tar listing
        # being iterated is never rebound.
        dummy1, file_out, dummy2 = run_shell_command('file %s', (extracted_file,))
        file_out_lower = file_out.lower()
        # The keywords only count when they come after the
        # "filename.ext:" prefix, i.e. when they are not part of the name.
        colon_at = file_out.find(':')
        # Lower-cased so that FIG.EPS or PAPER.TEX are recognised too.
        extension = extracted_file.split('.')[-1].lower()

        # is it TeX?
        if file_out.find(tex_output_contains) > -1:
            might_be_tex.append(extracted_file)

        # is it an image?
        elif file_out_lower.find(image_output_contains) > colon_at \
                or file_out_lower.find(eps_output_contains) > colon_at \
                or file_out.find(ps_output_contains) > colon_at:
            image_list.append(extracted_file)

        # if neither, maybe it is TeX or an image anyway, otherwise,
        # we don't care
        elif extension == tex_file_extension:
            # we might have tex source!
            might_be_tex.append(extracted_file)
        elif extension in image_file_extensions:
            # we might have an image!
            image_list.append(extracted_file)

    if not might_be_tex:
        # well, that's tragic
        # could not find TeX file in tar archive
        return ([], [], [])

    return (file_list, image_list, might_be_tex)