Ejemplo n.º 1
0
def main(argv=None):
    """Command-line entry point: parse options, then run ``pdf2heads``.

    Args:
        argv: Argument list without the program name. When ``None``,
            ``sys.argv[1:]`` is used.

    Returns:
        0 after printing the module docstring for ``-h``/``--help``,
        2 when option parsing (or anything downstream) raises
        ``UsageError``, 1 on ``ConfigError``, and ``None`` when
        ``pdf2heads`` finishes without raising.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        try:
            opts, args = getopt.getopt(
                argv, "ht",
                ["help", "test", "noxml", "highlight", "title", "author"])
        except getopt.error as msg:
            raise UsageError(msg)
        for o, _ in opts:
            if o in ('-h', '--help'):
                # __doc__ is None when docstrings are stripped (python -OO);
                # fall back to an empty string instead of raising TypeError.
                sys.stdout.write(__doc__ or "")
                sys.stdout.flush()
                return 0

        pdf2heads(opts, args)

    except UsageError as err:
        # err.msg may itself be an exception object (see the re-raise above),
        # so stringify it before writing.
        sys.stderr.write(str(err.msg) + "\n")
        sys.stderr.write("for help use --help\n")
        sys.stderr.flush()
        return 2
    except ConfigError as err:
        sys.stderr.writelines([str(err.msg), '\n'])
        sys.stderr.flush()
        return 1
def main(argv=None):
    """Command-line entry point: check configuration, then run ``opencalaistags``.

    The API-key check happens before option parsing, so an unconfigured key
    is reported even when ``--help`` is requested.

    Args:
        argv: Argument list without the program name. When ``None``,
            ``sys.argv[1:]`` is used.

    Returns:
        0 after printing the module docstring for ``-h``/``--help``,
        2 on ``UsageError``, and ``None`` when ``opencalaistags`` finishes
        without raising.

    Raises:
        ConfigError: When the module-level ``api_key`` still holds the
            placeholder value ``"XXX"``. It is not handled in this function
            and propagates to the caller.
    """
    # api_key is only read here, so no ``global`` declaration is required.
    if argv is None:
        argv = sys.argv[1:]
    try:
        if api_key == "XXX":
            raise ConfigError(
                "You need to register for an Open Calaise API key and configure it in config.py\n"
            )
        try:
            opts, args = getopt.getopt(argv, "h", ["help"])
        except getopt.error as msg:
            raise UsageError(msg)
        for o, _ in opts:
            if o in ('-h', '--help'):
                # __doc__ is None when docstrings are stripped (python -OO);
                # fall back to an empty string instead of raising TypeError.
                sys.stdout.write(__doc__ or "")
                sys.stdout.flush()
                return 0

        opencalaistags(opts, args)

    except UsageError as err:
        sys.stderr.writelines([str(err.msg) + '\n', "for help use --help\n"])
        sys.stderr.flush()
        return 2
Ejemplo n.º 3
0
def pdf2etree(argv=None):
    """Convert a PDF to XML then parse to an LXML etree and return.

    NOTE(review): the body of this block only parses the command-line
    options; no conversion or parsing step is present here, and the function
    returns ``None``.

    Args:
        argv: Argument list without the program name. When ``None``,
            ``sys.argv[1:]`` is used.

    Raises:
        UsageError: When ``getopt`` rejects the supplied options.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.error as msg:
        # Re-raise as the project's own usage error, carrying the getopt
        # exception along as its message.
        raise UsageError(msg)
Ejemplo n.º 4
0
def main(argv=None):
    """Command-line entry point: parse options, then run ``pdf2refs``.

    ``argv`` is the argument list without the program name; ``sys.argv[1:]``
    is used when it is ``None``. Returns 0 after handling ``-h``/``--help``
    (prints the module docstring) or ``-t``/``--test`` (runs the module's
    doctests).

    NOTE(review): the outer ``try`` below has no ``except``/``finally``
    clause in this excerpt; its handlers appear to have been cut off.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        try:
            opts, args = getopt.getopt(argv, "ht", ["help", "test", "noxml", "highlight"])
        except getopt.error, msg:
            # Convert getopt's error into the project's UsageError.
            raise UsageError(msg)
        for o, a in opts:
            if (o in ['-h', '--help']):
                # print help and exit
                sys.stdout.write(__doc__)
                sys.stdout.flush()
                return 0
            elif (o in ['-t', '--test']):
                # do unit test
                # doctest is imported lazily so it is only loaded on request.
                import doctest
                doctest.testmod()
                return 0
        # No early exit was requested: hand the parsed options and the
        # remaining positional arguments to pdf2refs.
        pdf2refs(opts, args)
Ejemplo n.º 5
0
def main(argv=None):
    """Command-line entry point: run ``pdf2heads``, retrying once if needed.

    After the first ``pdf2heads`` call, the module-level ``Found_abstract``
    flag is inspected. If it is falsy, ``automatic_rerunning`` is set to
    ``True`` and ``pdf2heads`` is called a second time with the same
    arguments.

    Args:
        argv: Argument list without the program name. When ``None``,
            ``sys.argv[1:]`` is used.

    Returns:
        0 after printing the module docstring for ``-h``/``--help``,
        2 on ``UsageError``, 1 on ``ConfigError``, and ``None`` when
        processing finishes without raising.
    """
    global Found_abstract
    global Found_Sammanfattning
    global automatic_rerunning

    if argv is None:
        argv = sys.argv[1:]
    try:
        try:
            opts, args = getopt.getopt(argv, "ht", [
                "help", "test", "noxml", "highlight", "title", "author",
                "verbose", "caps"
            ])
        except getopt.error as msg:
            raise UsageError(msg)
        for o, _ in opts:
            if o in ('-h', '--help'):
                # __doc__ is None when docstrings are stripped (python -OO);
                # fall back to an empty string instead of raising TypeError.
                sys.stdout.write(__doc__ or "")
                sys.stdout.flush()
                return 0

        pdf2heads(opts, args)

        if not Found_abstract:
            # NOTE(review): the message mentions --caps, but opts is passed
            # unchanged; presumably pdf2heads consults automatic_rerunning
            # to switch behaviour - verify against pdf2heads.
            print("Automatically running the program again with the option --caps")
            automatic_rerunning = True
            pdf2heads(opts, args)

    except UsageError as err:
        # err.msg may itself be an exception object (see the re-raise above),
        # so stringify it before writing.
        sys.stderr.write(str(err.msg) + "\n")
        sys.stderr.write("for help use --help\n")
        sys.stderr.flush()
        return 2
    except ConfigError as err:
        sys.stderr.writelines([str(err.msg), '\n'])
        sys.stderr.flush()
        return 1
Ejemplo n.º 6
0
def pdf2etree(argv=None):
    """Convert a PDF to XML then parse to an LXML etree and return.

    ``argv`` is the argument list without the program name; ``sys.argv[1:]``
    is used when it is ``None``. The last positional argument is taken as
    the path of the PDF.

    Returns 0 after printing the module docstring for ``-h``/``--help``.

    Raises ``UsageError`` for rejected options or a missing PDF argument,
    and ``ConfigError`` when the executable at ``pdf2xmlexe`` does not exist.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.error, msg:
        # Convert getopt's error into the project's UsageError.
        raise UsageError(msg)
    for o, a in opts:
        if (o in ['-h', '--help']):
            print __doc__
            return 0
    try:
        # The PDF path is the LAST positional argument; earlier ones are ignored.
        pdfpath = args[-1]
    except IndexError:
        raise UsageError("You must provide the name of a valid PDF to analyse")

    # Build <tmpdir>/<pdf file name>.xml inside a fresh temporary directory
    # whose name starts with the PDF's file name.
    # NOTE(review): nothing in the visible code removes tmpdir afterwards.
    pdffn = os.path.split(pdfpath)[-1]
    tmpdir = tempfile.mkdtemp(suffix='.d', prefix=pdffn)
    tmppath = os.path.join(tmpdir, "{0}.xml".format(pdffn))
    if not os.path.exists(pdf2xmlexe):
        raise ConfigError(
            "pdftoxml exectutable does not exist at specified path: '{0}'\nPlease check config.py"
            .format(pdf2xmlexe))
    # NOTE(review): the command line is assembled by plain string formatting
    # and executed through a shell by commands.getoutput (a Python 2-only
    # module). Paths containing spaces or shell metacharacters will break
    # the command or be interpreted by the shell; the paths are not quoted.
    cmdline = "{0} -q -blocks {1} {2}".format(pdf2xmlexe, pdfpath, tmppath)
    # The converter's output text is discarded; success is detected only by
    # whether the XML file can be opened below.
    commands.getoutput(cmdline)
    try:
        with open(tmppath, 'r') as fh:
            tree = etree.parse(fh)
    except IOError:
        raise UsageError(
Ejemplo n.º 7
0
def main(argv=None):
    """Parse a PDF with ``pdf2heads`` and publish the result as ``output.json``.

    Steps, in order:

    1. Parse options. ``-h``/``--help`` prints the module docstring and
       returns 0; ``--unittest`` sets the global ``test_flag``.
    2. Recreate an empty ``cache`` directory under the parse-result root and
       call ``pdf2heads(opts, [pdffile], args[1])``.
    3. Read ``author_1`` / ``author_2`` from the global ``json_content``.
    4. Pick the destination folder: a new name built from ``author``, a fresh
       UUID and the current time, or - if an existing folder's 2nd or 3rd
       ``_``-separated field equals one of the authors - that folder renamed
       to a new name which reuses its 4th field as the session id.
    5. Carry over keys from an existing ``output.json`` that are missing from
       ``json_content`` (via ``json_append``), delete the old file and call
       ``print_json(destination)``.

    Args:
        argv: Argument list without the program name. When ``None``,
            ``sys.argv[1:]`` is used. Two positional arguments are required:
            the PDF path and a second value passed through to ``pdf2heads``.

    Returns:
        The output directory name (``out_dir_name``) on success, 0 after
        ``--help``, 2 on ``UsageError`` and 1 on ``ConfigError``.
    """
    global Found_abstract
    global Found_Sammanfattning
    global automatic_rerunning
    global Found_Introduction
    global directiory
    global test_flag
    global author_1
    global author_2
    global json_content
    global my_id
    author_1 = ""
    author_2 = ""
    test_flag = False

    if argv is None:
        argv = sys.argv[1:]

    result_root = '../../../../output/parse_result/'

    try:
        try:
            opts, args = getopt.getopt(argv, "ht", [
                "help", "test", "unittest", "noxml", "highlight", "title",
                "author", "verbose", "caps"
            ])
        except getopt.error as msg:
            raise UsageError(msg)
        for o, _ in opts:
            if o in ('-h', '--help'):
                # __doc__ is None when docstrings are stripped (python -OO);
                # fall back to an empty string instead of raising TypeError.
                sys.stdout.write(__doc__ or "")
                sys.stdout.flush()
                return 0
            if o == '--unittest':
                test_flag = True

        # Validate up front so a missing argument is reported as a usage
        # error before the cache directory is wiped, rather than as a bare
        # IndexError half-way through.
        if len(args) < 2:
            raise UsageError(
                "Expected two positional arguments: the PDF file to analyse "
                "and the second argument passed on to pdf2heads")
        pdffile = args[0]

        if not os.path.exists(result_root):
            os.makedirs(result_root)
        directiory = "cache"
        source_dir = result_root + directiory

        # Always start from an empty cache directory.
        if os.path.exists(source_dir):
            shutil.rmtree(source_dir)
        os.makedirs(source_dir)

        # pdf2heads is expected to populate the global json_content
        # (presumably also the other Found_* globals - verify against it).
        pdf2heads(opts, [pdffile], args[1])

        if 'author_1' not in json_content:
            print_log("Error! no author exists")
        else:
            author_1 = json_content["author_1"]
        if 'author_2' in json_content:
            author_2 = json_content["author_2"]

        my_id = uuid.uuid1()
        # NOTE(review): `author` is not assigned anywhere in this function;
        # presumably a module-level global set elsewhere - confirm, otherwise
        # this line raises NameError.
        out_dir_name = author.lower() + "_" + str(my_id) + "_" + str(
            datetime.now())
        destination = result_root + out_dir_name + "/"

        # Look for an earlier result folder belonging to one of the authors
        # and, if found, rename it so the session id is kept.
        reserved = ("cache", "log.txt", "test_log.txt")
        known_authors = (author_1.lower(), author_2.lower())
        for folder in os.listdir(result_root):
            if folder in reserved:
                continue
            print(folder)
            folder_name = str(folder).split("_")
            print(folder_name)
            print("author_1: " + author_1)
            if len(author_2):
                print("author_2: " + author_2)
            if len(folder_name) < 4:
                # Not enough fields to hold authors and a session id; such
                # entries used to abort the scan with IndexError.
                continue
            if (folder_name[1].lower() in known_authors
                    or folder_name[2].lower() in known_authors):
                my_id = folder_name[3]
                print("session_id:" + my_id)
                out_dir_name = author.lower() + "_" + str(
                    my_id) + "_" + str(datetime.now())
                destination = result_root + out_dir_name + "/"
                os.rename(result_root + folder + "/", destination)
                break

        if test_flag:
            # Unit-test runs write to a fixed location named after the part
            # of the PDF argument before its first ".".
            destination = ("../../unit_testing/actual_result/" +
                           pdffile.split(".")[0] + "/")
            print_log(
                "unittest flag triggered. destination for output become:")
            print_log(destination)

        if not os.path.exists(destination):
            os.makedirs(destination)

        output_json = destination + "output.json"
        if os.path.isfile(output_json):
            # Merge: keep keys from the previous output that this run did
            # not produce, then drop the old file before rewriting it.
            with open(output_json) as f:
                json_content_output = json.load(f)
            for key in json_content_output:
                if key not in json_content:
                    json_append(key, json_content_output[key])
            os.remove(output_json)

        print_json(destination)

        # Sanity check that print_json actually produced the file.
        if not os.path.isfile(output_json):
            print_log("Error Report!\n")
            print_log("output folder " + destination +
                      "is empty. output.json is not properly output!")

        return out_dir_name

    except UsageError as err:
        # err.msg may itself be an exception object (see the re-raise above),
        # so stringify it before writing.
        sys.stderr.write(str(err.msg) + "\n")
        sys.stderr.write("for help use --help\n")
        sys.stderr.flush()
        return 2
    except ConfigError as err:
        sys.stderr.writelines([str(err.msg), '\n'])
        sys.stderr.flush()
        return 1