def main(argv=None):
    """Command-line entry point: parse the options and run ``pdf2heads``.

    Args:
        argv: Option/argument list to parse. When None, ``sys.argv[1:]``
            is used.

    Returns:
        0 after writing the module docstring for ``-h``/``--help``,
        2 when a ``UsageError`` is raised, 1 when a ``ConfigError`` is
        raised, and None when ``pdf2heads`` returns normally.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        try:
            opts, args = getopt.getopt(
                argv, "ht",
                ["help", "test", "noxml", "highlight", "title", "author"])
        except getopt.error as msg:
            # Re-raise option parsing failures as the project's usage error
            # so they are reported by the handler below.
            raise UsageError(msg)
        for o, _ in opts:
            if o in ('-h', '--help'):
                # print help and exit
                sys.stdout.write(__doc__)
                sys.stdout.flush()
                return 0
        pdf2heads(opts, args)
    except UsageError as err:
        # Written with sys.stderr.write so the block parses on Python 2 and 3.
        sys.stderr.write(str(err.msg) + "\n")
        sys.stderr.write("for help use --help\n")
        return 2
    except ConfigError as err:
        sys.stderr.writelines([str(err.msg), '\n'])
        sys.stderr.flush()
        return 1
def main(argv=None):
    """Command-line entry point: check the API key and run ``opencalaistags``.

    Args:
        argv: Option/argument list to parse. When None, ``sys.argv[1:]``
            is used.

    Returns:
        0 after writing the module docstring for ``-h``/``--help``,
        2 when a ``UsageError`` is raised, and None when
        ``opencalaistags`` returns normally.

    Raises:
        ConfigError: when the module-level ``api_key`` still has the
            placeholder value ``"XXX"``.
            NOTE(review): no handler for ConfigError is visible in this
            function, so it propagates to the caller - confirm this is
            intended.
    """
    global api_key
    if argv is None:
        argv = sys.argv[1:]
    try:
        if api_key == "XXX":
            raise ConfigError(
                "You need to register for an Open Calaise API key and configure it in config.py\n"
            )
        try:
            opts, args = getopt.getopt(argv, "h", ["help"])
        except getopt.error as msg:
            # Re-raise option parsing failures as the project's usage error.
            raise UsageError(msg)
        for o, _ in opts:
            if o in ('-h', '--help'):
                # print help and exit
                sys.stdout.write(__doc__)
                sys.stdout.flush()
                return 0
        opencalaistags(opts, args)
    except UsageError as err:
        sys.stderr.writelines(
            [str(err.msg) + '\n', "for help use --help\n"])
        sys.stderr.flush()
        return 2
def pdf2etree(argv=None):
    """Convert a PDF to XML then parse to an LXML etree and return.

    NOTE(review): the part of this function visible here only parses the
    command-line options; no conversion or parsing step follows, so the
    function returns None on success. Confirm against the complete file.

    Args:
        argv: Option/argument list to parse. When None, ``sys.argv[1:]``
            is used.

    Raises:
        UsageError: when ``getopt`` rejects the options.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.error as msg:
        # Re-raise option parsing failures as the project's usage error.
        raise UsageError(msg)
def main(argv=None): if argv is None: argv = sys.argv[1:] try: try: opts, args = getopt.getopt(argv, "ht", ["help", "test", "noxml", "highlight"]) except getopt.error, msg: raise UsageError(msg) for o, a in opts: if (o in ['-h', '--help']): # print help and exit sys.stdout.write(__doc__) sys.stdout.flush() return 0 elif (o in ['-t', '--test']): # do unit test import doctest doctest.testmod() return 0 pdf2refs(opts, args)
def main(argv=None):
    """Command-line entry point: run ``pdf2heads``, rerunning it if needed.

    After the first ``pdf2heads`` call, the module-level ``Found_abstract``
    flag is checked. If it is false, a notice is printed, the module-level
    ``automatic_rerunning`` flag is set to True, and ``pdf2heads`` is called
    a second time with the same options and arguments.

    Args:
        argv: Option/argument list to parse. When None, ``sys.argv[1:]``
            is used.

    Returns:
        0 after writing the module docstring for ``-h``/``--help``,
        2 when a ``UsageError`` is raised, 1 when a ``ConfigError`` is
        raised, and None when processing completes normally.
    """
    global Found_abstract
    global Found_Sammanfattning
    global automatic_rerunning
    if argv is None:
        argv = sys.argv[1:]
    try:
        try:
            opts, args = getopt.getopt(argv, "ht", [
                "help", "test", "noxml", "highlight", "title", "author",
                "verbose", "caps"
            ])
        except getopt.error as msg:
            # Re-raise option parsing failures as the project's usage error.
            raise UsageError(msg)
        for o, _ in opts:
            if o in ('-h', '--help'):
                # print help and exit
                sys.stdout.write(__doc__)
                sys.stdout.flush()
                return 0
        pdf2heads(opts, args)
        if not Found_abstract:
            # The option list itself is not modified; the second pass is
            # signalled through the automatic_rerunning flag.
            print("Automatically running the program again with the option --caps")
            automatic_rerunning = True
            pdf2heads(opts, args)
    except UsageError as err:
        # Written with sys.stderr.write so the block parses on Python 2 and 3.
        sys.stderr.write(str(err.msg) + "\n")
        sys.stderr.write("for help use --help\n")
        return 2
    except ConfigError as err:
        sys.stderr.writelines([str(err.msg), '\n'])
        sys.stderr.flush()
        return 1
def pdf2etree(argv=None): """Convert a PDF to XML then parse to an LXML etree and return.""" if argv is None: argv = sys.argv[1:] try: opts, args = getopt.getopt(argv, "h", ["help"]) except getopt.error, msg: raise UsageError(msg) for o, a in opts: if (o in ['-h', '--help']): print __doc__ return 0 try: pdfpath = args[-1] except IndexError: raise UsageError("You must provide the name of a valid PDF to analyse") pdffn = os.path.split(pdfpath)[-1] tmpdir = tempfile.mkdtemp(suffix='.d', prefix=pdffn) tmppath = os.path.join(tmpdir, "{0}.xml".format(pdffn)) if not os.path.exists(pdf2xmlexe): raise ConfigError( "pdftoxml exectutable does not exist at specified path: '{0}'\nPlease check config.py" .format(pdf2xmlexe)) cmdline = "{0} -q -blocks {1} {2}".format(pdf2xmlexe, pdfpath, tmppath) commands.getoutput(cmdline) try: with open(tmppath, 'r') as fh: tree = etree.parse(fh) except IOError: raise UsageError(
def main(argv=None):
    """Parse one PDF and store the result in a per-author session folder.

    Steps, in order:
      1. Parse the command-line options.
      2. Recreate an empty ``cache`` directory under the parse-result root
         and call ``pdf2heads`` on the first positional argument.
      3. Read ``author_1`` / ``author_2`` from the module-level
         ``json_content``.
      4. Pick the output folder: a fresh ``uuid1`` session id is used unless
         an existing result folder carries one of the authors in its name,
         in which case that folder is renamed and its id is reused.
      5. With ``--unittest`` the output goes to the unit-testing directory
         instead.
      6. Keys from a previous ``output.json`` that are missing from
         ``json_content`` are carried over via ``json_append``, the old file
         is removed, and ``print_json`` is called for the destination.

    Args:
        argv: Option/argument list to parse. When None, ``sys.argv[1:]``
            is used. Two positional arguments are read: ``args[0]`` is the
            PDF file and ``args[1]`` is passed to ``pdf2heads`` unchanged.
            NOTE(review): fewer than two positional arguments raises an
            uncaught IndexError - consider turning that into a UsageError.

    Returns:
        The output directory name (a string) on success, 0 after writing
        the module docstring for ``-h``/``--help``, 2 when a ``UsageError``
        is raised, and 1 when a ``ConfigError`` is raised.
    """
    global Found_abstract
    global Found_Sammanfattning
    global automatic_rerunning
    global Found_Introduction
    global directiory
    global test_flag
    global author_1
    global author_2
    global json_content
    global my_id

    author_1 = ""
    author_2 = ""
    test_flag = False
    if argv is None:
        argv = sys.argv[1:]
    try:
        try:
            opts, args = getopt.getopt(argv, "ht", [
                "help", "test", "unittest", "noxml", "highlight", "title",
                "author", "verbose", "caps"
            ])
        except getopt.error as msg:
            # Re-raise option parsing failures as the project's usage error.
            raise UsageError(msg)
        for o, _ in opts:
            if o in ('-h', '--help'):
                # print help and exit
                sys.stdout.write(__doc__)
                sys.stdout.flush()
                return 0
            if o == '--unittest':
                test_flag = True

        parse_root = '../../../../output/parse_result/'
        if not os.path.exists(parse_root):
            os.makedirs(parse_root)
        directiory = "cache"
        pdffile = args[0]
        source_dir = parse_root + directiory
        # Always start from an empty cache directory.
        if os.path.exists(source_dir):
            shutil.rmtree(source_dir)
        os.makedirs(source_dir)

        pdf2heads(opts, [pdffile], args[1])
        # NOTE(review): splitting on "." yields an empty first element for
        # paths that start with "." (e.g. "../paper.pdf") - verify callers
        # only pass plain file names.
        pdffile_name = pdffile.split(".")

        if 'author_1' not in json_content:
            print_log("Error! no author exists")
        else:
            author_1 = json_content["author_1"]
        if 'author_2' in json_content:
            author_2 = json_content["author_2"]

        my_id = uuid.uuid1()
        # NOTE(review): `author` is neither assigned nor declared in this
        # function; it must be a module-level name, otherwise this line
        # raises NameError - confirm.
        out_dir_name = author.lower() + "_" + str(my_id) + "_" + str(
            datetime.now())
        destination = parse_root + out_dir_name + "/"

        # Look for an earlier result folder that belongs to the same author
        # so that its session id can be reused.
        not_sessions = ("cache", "log.txt", "test_log.txt")
        wanted = (author_1.lower(), author_2.lower())
        for folder in os.listdir(parse_root):
            if folder in not_sessions:
                continue
            print(folder)
            folder_name = str(folder).split("_")
            print(folder_name)
            print("author_1: " + author_1)
            if author_2:
                print("author_2: " + author_2)
            if len(folder_name) < 4:
                # Parts 1-3 are read below; a name with fewer parts cannot
                # be a session folder and previously caused an IndexError.
                continue
            if (folder_name[1].lower() in wanted
                    or folder_name[2].lower() in wanted):
                my_id = folder_name[3]
                print("session_id:" + my_id)
                out_dir_name = author.lower() + "_" + str(
                    my_id) + "_" + str(datetime.now())
                destination = parse_root + out_dir_name + "/"
                os.rename(parse_root + folder + "/", destination)
                break

        if test_flag:
            destination = ("../../unit_testing/actual_result/" +
                           pdffile_name[0] + "/")
            print_log(
                "unittest flag triggered. destination for output become:")
            print_log(destination)

        if not os.path.exists(destination):
            os.makedirs(destination)

        output_json = destination + "output.json"
        if os.path.isfile(output_json):
            # Carry over keys from the previous run that this run did not
            # produce, then drop the old file before writing the new one.
            with open(output_json) as f:
                json_content_output = json.load(f)
            for key in json_content_output:
                if key not in json_content:
                    json_append(key, json_content_output[key])
            os.remove(output_json)

        print_json(destination)

        # Sanity check: print_json is expected to have created output.json.
        if not os.path.isfile(output_json):
            print_log("Error Report!\n")
            print_log("output folder " + destination +
                      "is empty. output.json is not properly output!")
        # The automatic rerun with --caps (keyed on Found_abstract) is
        # disabled in this variant of main().
        return out_dir_name
    except UsageError as err:
        # Written with sys.stderr.write so the block parses on Python 2 and 3.
        sys.stderr.write(str(err.msg) + "\n")
        sys.stderr.write("for help use --help\n")
        return 2
    except ConfigError as err:
        sys.stderr.writelines([str(err.msg), '\n'])
        sys.stderr.flush()
        return 1