Beispiel #1
0
def setprocesses():
    """Update the number of parallel processes from the submitted form.

    Reads the ``processes`` form field, stores it in the module-level
    ``MAX_PROCESSES`` setting, logs the change and returns the result of
    ``updateHTML()``.

    If the field is missing or not an integer, the current setting is left
    untouched, the problem is written to the user log, and the page is
    returned as usual.
    """
    # Without this declaration the assignment below only bound a local name:
    # the new value was logged but never became visible to other handlers.
    global MAX_PROCESSES

    raw_value = request.form.get('processes')
    try:
        requested = int(raw_value)
    except (TypeError, ValueError):
        # TypeError: field missing (None); ValueError: not a number.
        writeUserLog("Invalid number of processes: " + str(raw_value))
        return updateHTML()

    MAX_PROCESSES = requested
    print("Set number of parallel to " + str(MAX_PROCESSES) +
          " processes at timestamp: " +
          datetime.now().strftime('%Y%m%d%H%M%S'))
    writeUserLog("Parallel number of processes set to " + str(MAX_PROCESSES))
    return updateHTML()
Beispiel #2
0
def fileuploadText(UPLOAD_FOLDER, OUTPUT_FOLDER, settings, filename):
    """Process one uploaded text/PDF file and write its XML output.

    Steps, in order:
      1. ``prepareText`` is called for the upload.
      2. ``citeExtract.pl`` is run inside ``parsCit``; its stdout is stored as
         ``tmp/<filename>_ParsText.xml``.
      3. ``createBibstruct`` builds the output soup; for ``.pdf`` uploads the
         ``BibStructured`` tags from ``processfileGrobid`` are appended to
         its ``algorithm`` tag.
      4. The result is written to
         ``OUTPUT_FOLDER/<filename>/Output<filename>.xml``, the run is logged
         and the upload is deleted.

    :param UPLOAD_FOLDER: path prefix of the upload directory.
    :param OUTPUT_FOLDER: path prefix of the output directory.
    :param settings: sequence whose second item contains ``"True"`` when
        citation-only extraction is requested.
    :param filename: name of the uploaded file (timestamp-prefixed).
    """
    Txt_Dummy = "True" in settings[1]

    prepareText(UPLOAD_FOLDER, Txt_Dummy, filename)

    mode = "extract_citations" if Txt_Dummy else "extract_meta"

    # NOTE(review): with shell=True and a list, only the first item is the
    # command line on POSIX; the second item is handed to the shell itself
    # and never reaches citeExtract.pl. Output is captured from stdout below.
    p = subprocess.Popen([
        "./citeExtract.pl -m" + mode + " " + LOCDB + "tmp/" + filename +
        "_Textdummy.txt ", LOCDB + "tmp/" + filename + "_ParsText.xml"
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         cwd=parsCit)
    parscitstring = p.communicate()[0]
    with open("tmp/" + filename + "_ParsText.xml", 'w') as f:
        f.write(parscitstring)

    outputxmlsoup = createBibstruct(filename)

    if filename[-3:].lower() == "pdf":
        # Merge the Grobid references into the same <algorithm> element.
        output_grobid = processfileGrobid(UPLOAD_FOLDER, filename)
        algotag3 = outputxmlsoup.algorithm
        for curr in output_grobid.find_all('BibStructured'):
            algotag3.append(curr)

    os.system("mv " + LOCDB + "tmp/" + filename + '_Textdummy.txt ' + LOCDB +
              "processed-files/" + filename + '_Textdummy.txt')
    # NOTE(review): the file written above is "_ParsText.xml", but this moves
    # "_ParsText.txt" -- presumably a typo, confirm before changing.
    os.system("mv " + LOCDB + "tmp/" + filename + '_ParsText.txt ' + LOCDB +
              "processed-files/" + filename + '_ParsText.txt')

    outputfile = OUTPUT_FOLDER + filename + "/Output" + filename + '.xml'
    os.makedirs(OUTPUT_FOLDER + filename)
    with open(outputfile, 'w') as xmlf:
        xmlf.write(outputxmlsoup.encode('utf-8'))

    settings = ["TXT", Txt_Dummy]
    writeLog(filename, settings, True)
    os.remove(UPLOAD_FOLDER + filename)

    # The output directory was created above. A second os.makedirs() here
    # used to raise OSError on every run, so it has been removed.

    # Strip the leading "<timestamp>_" prefix for user-facing messages.
    outputfilename = filename.replace(filename.split("_")[0], "")[1:]
    if os.path.exists(outputfile):
        print("Finished inputfile : " + outputfilename)
        writeUserLog("Finished inputfile : " + outputfilename)
    else:
        print("Error inputfile : " + outputfilename)
        writeUserLog("Error inputfile : " + outputfilename)
Beispiel #3
0
def fileupload():
    """Store the uploaded files, process them synchronously, return XML.

    For a POST request every file in the ``files`` field that passes
    ``check_file_extension`` is saved to ``UPLOAD_FOLDER`` under a
    timestamp-prefixed name; files with other extensions are skipped. All
    saved files are then passed to ``processFile`` and the output of
    ``createResultView`` is returned as a ``text/xml`` response.

    Returns a redirect to the request URL when no file part or an empty
    filename is submitted, and the string ``"Error"`` for non-POST requests.

    The module-level ``Settings`` list is updated in place from the
    ``pdfFlag`` and ``Txt_Dummy`` form fields.
    """
    if request.method != 'POST':
        return "Error"

    if 'files' not in request.files:
        flash('No file part')
        return redirect(request.url)

    # requested files and pdf mode are retrieved from flask
    fileList = request.files.getlist('files')
    Settings[0] = "IMG" if request.form.get('pdfFlag') else "TXT"
    Settings[1] = "True" if request.form.get('Txt_Dummy') else "False"

    filenameFP_List = []
    for uploadedFile in fileList:
        if uploadedFile.filename == '':
            flash('No selected inputfile')
            return redirect(request.url)
        if uploadedFile and check_file_extension(uploadedFile.filename,
                                                 ALLOWED_EXTENSIONS):
            filenameFP = secure_filename(uploadedFile.filename)
            print('Uploaded inputfile : ' + filenameFP)
            writeUserLog("Uploaded inputfile : " + filenameFP)
            # adding timestamp
            ts = datetime.now().strftime('%Y%m%d%H%M%S')
            filenameFP = ts + "_" + filenameFP
            writeLog(filenameFP, Settings, False)
            uploadedFile.save(os.path.join(UPLOAD_FOLDER, filenameFP))
            filenameFP_List.append(filenameFP)

    processFile(UPLOAD_FOLDER, OUTPUT_FOLDER, MAX_PROCESSES, Settings,
                filenameFP_List)

    # sync process: the result is returned directly. The former
    # "autoview" template branch sat after this return and could never run,
    # so it was removed together with the values only it used.
    result = createResultView(OUTPUT_FOLDER, filenameFP_List)
    return Response(result, content_type='text/xml; charset=utf-8')
Beispiel #4
0
def deleteoutput():
    """Replace the ``output/`` directory with an empty one and log it.

    Returns the result of ``updateHTML()``.
    """
    output_dir = "output/"
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    stamp = datetime.now().strftime('%Y%m%d%H%M%S')
    print("Output removed at timestamp: " + stamp)
    writeUserLog("Output removed")
    return updateHTML()
def fileuploadText(UPLOAD_FOLDER, OUTPUT_FOLDER, settings, filename):
    """Process one uploaded text file and write its XML output.

    Steps, in order:
      1. ``prepareText`` is called for the upload.
      2. ``citeExtract.pl`` is run inside ``parsCit``; its stdout is stored as
         ``tmp/<filename>_ParsText.xml``.
      3. ``createBibstruct`` builds the output soup, then both temporary
         files are deleted.
      4. The result is written to
         ``OUTPUT_FOLDER/<filename>/Output<filename>.xml``, the run is logged
         and the upload is deleted.

    :param UPLOAD_FOLDER: path prefix of the upload directory.
    :param OUTPUT_FOLDER: path prefix of the output directory.
    :param settings: sequence whose second item contains ``"True"`` when
        citation-only extraction is requested.
    :param filename: name of the uploaded file (timestamp-prefixed).
    """
    Txt_Dummy = "True" in settings[1]

    prepareText(UPLOAD_FOLDER, Txt_Dummy, filename)

    mode = "extract_citations" if Txt_Dummy else "extract_meta"

    # NOTE(review): with shell=True and a list, only the first item is the
    # command line on POSIX; the second item is handed to the shell itself
    # and never reaches citeExtract.pl. Output is captured from stdout below.
    p = subprocess.Popen([
        "./citeExtract.pl -m" + mode + " " + LOCDB + "tmp/" + filename +
        "_Textdummy.txt ", LOCDB + "tmp/" + filename + "_ParsText.xml"
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         cwd=parsCit)
    parscitstring = p.communicate()[0]
    with open("tmp/" + filename + "_ParsText.xml", 'w') as f:
        f.write(parscitstring)

    outputxmlsoup = createBibstruct(filename)

    # delete tmp files
    os.remove("tmp/" + filename + '_Textdummy.txt')
    os.remove("tmp/" + filename + '_ParsText.xml')

    outputfile = OUTPUT_FOLDER + filename + "/Output" + filename + '.xml'
    os.makedirs(OUTPUT_FOLDER + filename)
    with open(outputfile, 'w') as xmlf:
        xmlf.write(outputxmlsoup.encode('utf-8'))

    settings = ["TXT", Txt_Dummy]
    writeLog(filename, settings, True)

    os.remove(UPLOAD_FOLDER + filename)

    # The output directory was created above. A second os.makedirs() here
    # used to raise OSError on every run, so it has been removed.

    # Strip the leading "<timestamp>_" prefix for user-facing messages.
    outputfilename = filename.replace(filename.split("_")[0], "")[1:]
    if os.path.exists(outputfile):
        print("Finished inputfile : " + outputfilename)
        writeUserLog("Finished inputfile : " + outputfilename)
    else:
        print("Error inputfile : " + outputfilename)
        writeUserLog("Error inputfile : " + outputfilename)
Beispiel #6
0
def restoreprocess():
    """Clear the temporary working data and restart pending processing.

    Recreates ``tmp/`` empty, removes the OCR ``processedFiles`` directory if
    present, logs the restart, calls ``processRestore`` and returns the result
    of ``updateHTML()``.
    """
    tmp_dir = "tmp/"
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    ocr_processed = ocropy + "/processedFiles/"
    if os.path.exists(ocr_processed):
        shutil.rmtree(ocr_processed)

    stamp = datetime.now().strftime('%Y%m%d%H%M%S')
    print("Restarted at timestamp: " + stamp)
    writeUserLog("Restarted the app")

    processRestore(UPLOAD_FOLDER, OUTPUT_FOLDER, MAX_PROCESSES)
    return updateHTML()
def fileuploadXML(UPLOAD_FOLDER, OUTPUT_FOLDER, filename):
    """Process one uploaded XML file and write its XML output.

    The upload is parsed with BeautifulSoup, handed to ``prepareXML``, run
    through ``citeExtract.pl`` (stdout saved as ``tmp/<filename>_ParsXML.xml``)
    and converted by ``createBibstruct``. The result is written to
    ``OUTPUT_FOLDER/<filename>/Output<filename>.xml``; temporary files and the
    upload itself are deleted and the outcome is logged.
    """
    upload_path = UPLOAD_FOLDER + filename
    with open(upload_path, 'r') as source:
        xmlsoup = bs.BeautifulSoup(source.read(), 'xml')

    prepareXML(xmlsoup, filename)

    tmp_prefix = "tmp/" + filename
    dummy_txt = tmp_prefix + '_XMLdummy.txt'
    parsed_xml = tmp_prefix + "_ParsXML.xml"

    command_line = ("./citeExtract.pl -m extract_citations " + LOCDB +
                    tmp_prefix + "_XMLdummy.txt ")
    extractor = subprocess.Popen([command_line, LOCDB + parsed_xml],
                                 shell=True,
                                 stdout=subprocess.PIPE,
                                 cwd=parsCit)
    parscit_output = extractor.communicate()[0]
    with open(parsed_xml, 'w') as parsed:
        parsed.write(parscit_output)

    result_soup = createBibstruct(xmlsoup, filename)

    # the intermediate files are no longer needed
    os.remove(dummy_txt)
    os.remove(parsed_xml)

    result_dir = OUTPUT_FOLDER + filename
    result_path = result_dir + "/Output" + filename + '.xml'
    os.makedirs(result_dir)
    with open(result_path, 'w') as target:
        target.write(result_soup.encode('utf-8'))

    writeLog(filename, ["XML"], True)

    os.remove(upload_path)

    # drop the leading "<timestamp>_" prefix for user-facing messages
    timestamp_part = filename.split("_")[0]
    display_name = filename.replace(timestamp_part, "")[1:]
    if os.path.exists(result_path):
        message = "Finished inputfile : " + display_name
    else:
        message = "Error inputfile : " + display_name
    print(message)
    writeUserLog(message)
Beispiel #8
0
def resetapp():
    """Wipe uploads, temporary data and both log files, then recreate them.

    ``upload/`` and ``tmp/`` are recreated empty, the OCR ``processedFiles``
    directory is removed, and ``processLog.txt`` / ``visibleUserLog.txt`` are
    replaced by empty files. Returns the result of ``updateHTML()``.
    """
    work_dirs = ("upload/", "tmp/")
    log_files = ("processLog.txt", "visibleUserLog.txt")

    for directory in work_dirs:
        if os.path.exists(directory):
            shutil.rmtree(directory)

    ocr_processed = ocropy + "/processedFiles/"
    if os.path.exists(ocr_processed):
        shutil.rmtree(ocr_processed)

    for log_name in log_files:
        if os.path.exists(log_name):
            os.remove(log_name)

    for directory in work_dirs:
        os.makedirs(directory)

    for log_name in log_files:
        with open(log_name, "w") as handle:
            handle.write("")

    stamp = datetime.now().strftime('%Y%m%d%H%M%S')
    print("Cleared App at timestamp: " + stamp)
    writeUserLog("Resetted the app")
    return updateHTML()
Beispiel #9
0
def fileupload():
    """Store the uploaded files and enqueue them for background processing.

    For a POST request every file in the ``files`` field is saved to
    ``LOCDB + UPLOAD_FOLDER`` under a timestamp-prefixed name, then a
    ``processFile`` job is enqueued on ``q`` and its job id is returned.

    Returns a redirect to the request URL when no file part or an empty
    filename is submitted, an error string when a file has a disallowed
    extension or enqueueing fails, and ``"Error"`` for non-POST requests.

    The module-level ``Settings`` list is updated in place from the
    ``pdfFlag`` and ``Txt_Dummy`` form fields.
    """
    if debugMode.lower() == "yes":
        print("")
        print("####################################")
        print("In fileupload()")
        print("####################################")
        print("")

    # checks the upload request parameters and stores it to the upload folder
    if request.method != 'POST':
        return "Error"

    if 'files' not in request.files:
        flash('No file part')
        return redirect(request.url)

    # requested files and pdf mode are retrieved from flask
    fileList = request.files.getlist('files')
    Settings[0] = "IMG" if request.form.get('pdfFlag') else "TXT"
    Settings[1] = "True" if request.form.get('Txt_Dummy') else "False"

    filenameFP_List = []
    for uploadedFile in fileList:
        if uploadedFile.filename == '':
            flash('No selected inputfile')
            return redirect(request.url)
        if not (uploadedFile and check_file_extension(uploadedFile.filename,
                                                      ALLOWED_EXTENSIONS)):
            return "Error: Invalid file extension..."

        filenameFP = secure_filename(uploadedFile.filename)
        print('Uploaded inputfile : ' + filenameFP)
        print("")
        writeUserLog("Uploaded inputfile : " + filenameFP)
        # adding timestamp
        ts = datetime.now().strftime('%Y%m%d%H%M%S')
        filenameFP = ts + "_" + filenameFP
        writeLog(filenameFP, Settings, False)
        uploadedFile.save(os.path.join(LOCDB + UPLOAD_FOLDER, filenameFP))
        filenameFP_List.append(filenameFP)

    try:
        job = q.enqueue_call(func=processFile,
                             args=(
                                 UPLOAD_FOLDER,
                                 OUTPUT_FOLDER,
                                 MAX_PROCESSES,
                                 Settings,
                                 filenameFP_List,
                             ),
                             result_ttl=8000,
                             timeout=80000)
        job_id = job.get_id()
    except Exception as err:
        # A bare "except:" used to hide the cause and also trapped
        # SystemExit/KeyboardInterrupt; report the error before answering.
        print("Enqueueing processFile failed: " + str(err))
        return "An Error occured during file processing..."

    # The synchronous result view and the "autoview" template branch that
    # used to follow were unreachable and have been removed.
    print(job_id)
    return job_id
Beispiel #10
0
def form():
    """Write "App started" to the user log and return ``updateHTML()``."""
    writeUserLog("App started")
    page = updateHTML()
    return page
Beispiel #11
0
def form():
    """Write "App started" to the user log and return ``updateHTML()``."""
    writeUserLog("App started")
    page = updateHTML()
    return page
Beispiel #12
0
def fileuploadXML(UPLOAD_FOLDER, OUTPUT_FOLDER, filename):
    """Process one uploaded XML or HTML file and write its XML output.

    Steps, in order:
      1. The upload is parsed with BeautifulSoup. Names ending in ``xml`` go
         through ``prepareXML``; everything else goes through ``prepareHTML``
         and, if that returns ``True``, ``mapHTML``.
      2. ``citeExtract.pl`` is run inside ``parsCit``; its stdout is stored as
         ``tmp/<filename>_ParsXML.xml``.
      3. ``createBibstruct`` (XML) or ``createBibstructHTML`` (other) builds
         the output soup; ``BibStructured`` tags from ``mapHTML`` are appended
         to its ``algorithm`` tag.
      4. Intermediate files are moved to ``processed-files/``, the result is
         written to ``LOCDB + OUTPUT_FOLDER/<filename>/Output<filename>.xml``,
         the run is logged and the upload is deleted.

    :param UPLOAD_FOLDER: path prefix of the upload directory.
    :param OUTPUT_FOLDER: path prefix of the output directory (below LOCDB).
    :param filename: name of the uploaded file (timestamp-prefixed).
    """
    mapOutputsoup = None

    # open input file
    with open(UPLOAD_FOLDER + filename, 'r') as f:
        xmlsoup = bs.BeautifulSoup(f.read(), 'xml')

    is_xml = filename[-3:].lower() == "xml"
    if is_xml:
        prepareXML(xmlsoup, filename)
    else:
        response = prepareHTML(xmlsoup, filename)
        # Kept as an explicit comparison: prepareHTML's return type is not
        # visible here, so a plain truthiness test might behave differently.
        if response == True:
            mapOutputsoup = mapHTML(xmlsoup, filename)

    # NOTE(review): with shell=True and a list, only the first item is the
    # command line on POSIX; the second item is handed to the shell itself
    # and never reaches citeExtract.pl. Output is captured from stdout below.
    p = subprocess.Popen([
        "./citeExtract.pl -m extract_citations " + LOCDB + "tmp/" + filename +
        "_XMLdummy.txt ", LOCDB + "tmp/" + filename + "_ParsXML.xml"
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         cwd=parsCit)
    parscitstring = p.communicate()[0]
    with open("tmp/" + filename + "_ParsXML.xml", 'w') as f:
        f.write(parscitstring)

    # This test used filename[:-3] (everything except the suffix), which never
    # equals "xml", so XML uploads were always sent to createBibstructHTML.
    if is_xml:
        outputxmlsoup = createBibstruct(xmlsoup, filename)
    else:
        outputxmlsoup = createBibstructHTML(filename)

    if mapOutputsoup is not None:
        algotag2 = outputxmlsoup.algorithm
        for currtag in mapOutputsoup.find_all('BibStructured'):
            algotag2.append(currtag)

    os.system("mv " + LOCDB + "tmp/" + filename + '_XMLdummy.txt ' + LOCDB +
              "processed-files/" + filename + '_XMLdummy.txt')
    os.system("mv " + LOCDB + "tmp/" + filename + '_ParsXML.xml ' + LOCDB +
              "processed-files/" + filename + '_ParsXML.xml')

    output_dir = LOCDB + OUTPUT_FOLDER + filename
    outputfile = output_dir + "/Output" + filename + '.xml'
    os.makedirs(output_dir)
    with open(outputfile, 'w') as xmlf:
        xmlf.write(outputxmlsoup.encode('utf-8'))

    settings = ["XML"]
    writeLog(filename, settings, True)
    os.remove(UPLOAD_FOLDER + filename)

    # Strip the leading "<timestamp>_" prefix for user-facing messages.
    outputfilename = filename.replace(filename.split("_")[0], "")[1:]
    # Check the path that was actually written (including LOCDB); the old
    # check omitted LOCDB and only matched when the cwd happened to be LOCDB.
    if os.path.exists(outputfile):
        print("Finished inputfile : " + outputfilename)
        writeUserLog("Finished inputfile : " + outputfilename)
    else:
        print("Error inputfile : " + outputfilename)
        writeUserLog("Error inputfile : " + outputfilename)
Beispiel #13
0
def fileuploadIMG(UPLOAD_FOLDER, OUTPUT_FOLDER, settings, filename):
    """Process one uploaded image or PDF file and log the outcome.

    PDFs (lower-case ``pdf`` extension) are always processed. Other files are
    opened with ``Image`` and processed unless ``getcolors()`` returns an
    empty list. Processing runs ``prepareIMG``, then ``citeExtract.pl`` on the
    OCR text (stdout saved as ``tmp/<filename>_ParsIMG.xml``), then
    ``createBibstruct``. When the OCR directory contains ``temp.html`` the XML
    and HTML results are copied into ``OUTPUT_FOLDER/<filename>/``; the OCR
    directory is removed afterwards. The upload is deleted in every case.
    """
    ocr_root = ocropy + "/processedFiles/"
    if not os.path.exists(ocr_root):
        os.makedirs(ocr_root)

    # a pdf is never opened as an image; anything else must yield colours
    is_pdf = '.' in filename and filename.rsplit('.', 1)[1] in set(['pdf'])
    colors = 0
    if not is_pdf:
        colors = Image.open(UPLOAD_FOLDER + filename).getcolors()

    if is_pdf or colors is None or len(colors) != 0:
        # run the OCR step that extracts the text from the image
        prepareIMG(UPLOAD_FOLDER, OUTPUT_FOLDER, filename)

        ocr_dir = ocr_root + filename
        tmp_xml = "tmp/" + filename + "_ParsIMG.xml"

        command_line = ("./citeExtract.pl -m extract_citations " + ocr_dir +
                        "/ocrWdummy.txt")
        extractor = subprocess.Popen([command_line, LOCDB + tmp_xml],
                                     shell=True,
                                     stdout=subprocess.PIPE,
                                     cwd=parsCit)
        parscit_output = extractor.communicate()[0]

        with open(tmp_xml, 'w') as parsed:
            parsed.write(parscit_output)

        result_soup = createBibstruct(filename)

        # the intermediate ParsCit file is no longer needed
        os.remove(tmp_xml)

        with open(ocr_dir + '/xmloutput.xml', 'w') as target:
            target.write(result_soup.encode('utf-8'))

        # only publish results when the OCR step produced its html view
        if os.path.exists(ocr_dir + '/temp.html'):
            publish_dir = OUTPUT_FOLDER + filename
            copyfile(ocr_dir + "/xmloutput.xml",
                     publish_dir + "/Output" + filename + ".xml")
            copyfile(ocr_dir + "/temp.html",
                     publish_dir + "/filenameTemp.html")
            copyfile(ocr_dir + "/tempcorrection.html",
                     publish_dir + "/filenameTempcorrection.html")

        # drop the per-file OCR working data
        shutil.rmtree(ocr_dir)

    writeLog(filename, ["IMG"], True)

    os.remove(UPLOAD_FOLDER + filename)

    result_path = OUTPUT_FOLDER + filename + "/Output" + filename + '.xml'
    # drop the leading "<timestamp>_" prefix for user-facing messages
    timestamp_part = filename.split("_")[0]
    display_name = filename.replace(timestamp_part, "")[1:]
    if os.path.exists(result_path):
        message = "Finished inputfile : " + display_name
    else:
        message = "Error inputfile : " + display_name
    print(message)
    writeUserLog(message)