Ejemplo n.º 1
0
def fileuploadIMG(UPLOAD_FOLDER, OUTPUT_FOLDER, settings, filename):
    #try:
    columnNumber = int(settings[2])
    if not os.path.exists(ocropy + "/processedFiles/"):
        os.makedirs(ocropy + "/processedFiles/")
    
    #check if pdf
    pdfFlag = False
    im = 0
    if '.' in filename and filename.rsplit('.', 1)[1] in set(['pdf']):
        pdfFlag = True
    else:
        #check if image is valid
        im = Image.open(UPLOAD_FOLDER + filename)
        im = im.getcolors()
        
    if pdfFlag or im == None or len(im) != 0:        
        #process the image file extract text of it
        prepareIMG(UPLOAD_FOLDER, OUTPUT_FOLDER, filename, columnNumber)
    
        #stop if no text found
        if os.path.exists(ocropy + "/processedFiles/"+filename+'/ocrWdummy.txt'):
        
            p = subprocess.Popen(["./citeExtract.pl -m extract_citations " + ocropy +"/processedFiles/" +filename+"/ocrWdummy.txt", LOCDB + "tmp/" + filename +"_ParsIMG.xml"],shell=True,stdout=subprocess.PIPE,cwd=parsCit)
            parscitstring= p.communicate()[0]
            with open("tmp/" + filename + "_ParsIMG.xml", 'w') as f:
                f.write(parscitstring)
            
            outputxmlsoup = createBibstruct(filename)
            
            #delete tmp files
            os.remove("tmp/" + filename+'_ParsIMG.xml')
                
            #with open(OUTPUT_FOLDER + "Output" + filename +'.xml','w') as xmlf:
            #    xmlf.write(outputxmlsoup.encode('utf-8'))    
            with open(ocropy + "/processedFiles/" + filename + '/xmloutput.xml','w') as xmlf:
                xmlf.write(outputxmlsoup.encode('utf-8'))
            
            copyfile(ocropy + "/processedFiles/" + filename + "/xmloutput.xml", OUTPUT_FOLDER + filename + "/Output" + filename + ".xml")
            copyfile(ocropy + "/processedFiles/" + filename + "/temp.html", OUTPUT_FOLDER + filename + "/filenameTemp.html")
            copyfile(ocropy + "/processedFiles/" + filename + "/tempcorrection.html", OUTPUT_FOLDER + filename + "/filenameTempcorrection.html")  
            
            #uncomment to remove tmp data images
            shutil.rmtree(ocropy + "/processedFiles/" + filename)

    settings = []
    settings.append("IMG")
    settings.append(columnNumber)
    writeLog(filename, settings, True)
    #writeCorrect(filename)
    
    os.remove(UPLOAD_FOLDER + filename)
    
    outputfile = OUTPUT_FOLDER + filename + "/Output" + filename+'.xml'
    outputfilename = filename.replace(filename.split("_")[0], "")[1:]
    if os.path.exists(outputfile):
        print "Finished inputfile : " + outputfilename 
    else:
        print "Error inputfile : " + outputfilename
Ejemplo n.º 2
0
def fileuploadText(UPLOAD_FOLDER, OUTPUT_FOLDER, settings, filename):
    #checks the file extension and preprocesses the file based on it
    if "True" in settings[1]:
        Txt_Dummy = True
    else:
        Txt_Dummy = False

    prepareText(UPLOAD_FOLDER, Txt_Dummy, filename)

    mode = "extract_meta"
    if Txt_Dummy:
        mode = "extract_citations"

    p = subprocess.Popen([
        "./citeExtract.pl -m" + mode + " " + LOCDB + "tmp/" + filename +
        "_Textdummy.txt ", LOCDB + "tmp/" + filename + "_ParsText.xml"
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         cwd=parsCit)
    parscitstring = p.communicate()[0]
    with open("tmp/" + filename + "_ParsText.xml", 'w') as f:
        f.write(parscitstring)

    outputxmlsoup = createBibstruct(filename)

    if filename[-3:].lower() == "pdf":
        output_grobid = processfileGrobid(UPLOAD_FOLDER, filename)
        xmltags3 = output_grobid.find_all('BibStructured')

        algotag3 = outputxmlsoup.algorithm
        for curr in xmltags3:
            algotag3.append(curr)

    os.system("mv " + LOCDB + "tmp/" + filename + '_Textdummy.txt ' + LOCDB +
              "processed-files/" + filename + '_Textdummy.txt')
    os.system("mv " + LOCDB + "tmp/" + filename + '_ParsText.txt ' + LOCDB +
              "processed-files/" + filename + '_ParsText.txt')

    os.makedirs(OUTPUT_FOLDER + filename)
    with open(OUTPUT_FOLDER + filename + "/Output" + filename + '.xml',
              'w') as xmlf:
        xmlf.write(outputxmlsoup.encode('utf-8'))

    settings = []
    settings.append("TXT")
    settings.append(Txt_Dummy)
    writeLog(filename, settings, True)
    os.remove(UPLOAD_FOLDER + filename)
    os.makedirs(OUTPUT_FOLDER + filename)
    outputfile = OUTPUT_FOLDER + filename + "/Output" + filename + '.xml'
    outputfilename = filename.replace(filename.split("_")[0], "")[1:]
    if os.path.exists(outputfile):
        print "Finished inputfile : " + outputfilename
        writeUserLog("Finished inputfile : " + outputfilename)
    else:
        print "Error inputfile : " + outputfilename
        writeUserLog("Error inputfile : " + outputfilename)
Ejemplo n.º 3
0
def fileupload():
    #checks the upload and stores it to the upload folder
    if request.method == 'POST':
        if 'files' not in request.files:
            flash('No file part')
            return redirect(request.url)
        #requested files and pdf mode are retrieved from flask
        fileList = request.files.getlist('files')
        if request.form.get('pdfFlag'):
            Settings[0] = "IMG"
        else:
            Settings[0] = "TXT"
        if request.form.get('Txt_Dummy'):
            Settings[1] = "True"
        else:
            Settings[1] = "False"
        #if request.form.get('colBool'):
        #Settings[2] = str(int(request.form.get('colNumb')) - 1)
        #else:
        #Settings[2] = "0"
        autoview = False
        if request.form.get('autoviewResults'):
            autoview = True
        filenameFP_List = []
        filenameString = ""
        for uploadedFile in fileList:
            if uploadedFile.filename == '':
                flash('No selected inputfile')
                return redirect(request.url)
            if uploadedFile and check_file_extension(uploadedFile.filename,
                                                     ALLOWED_EXTENSIONS):
                filenameFP = secure_filename(uploadedFile.filename)
                print 'Uploaded inputfile : ' + filenameFP
                writeUserLog("Uploaded inputfile : " + filenameFP)
                #adding timestamp
                ts = datetime.now().strftime('%Y%m%d%H%M%S')
                filenameFP = ts + "_" + filenameFP
                writeLog(filenameFP, Settings, False)
                uploadedFile.save(os.path.join(UPLOAD_FOLDER, filenameFP))
                filenameFP_List.append(filenameFP)
                filenameString += filenameFP + "\n"

        processFile(UPLOAD_FOLDER, OUTPUT_FOLDER, MAX_PROCESSES, Settings,
                    filenameFP_List)

        #sync process
        result = createResultView(OUTPUT_FOLDER, filenameFP_List)
        return Response(result, content_type='text/xml; charset=utf-8')

        if autoview:
            return render_template("form_submitocr.html",
                                   waiting="1",
                                   filesText=filenameString)
        else:
            return updateHTML()

    return "Error"
def fileuploadText(UPLOAD_FOLDER, OUTPUT_FOLDER, settings, filename):
    #try:
    #checks the file extension and preprocesses the file based on it
    if "True" in settings[1]:
        Txt_Dummy = True
    else:
        Txt_Dummy = False

    prepareText(UPLOAD_FOLDER, Txt_Dummy, filename)

    mode = "extract_meta"
    if Txt_Dummy:
        mode = "extract_citations"

    p = subprocess.Popen([
        "./citeExtract.pl -m" + mode + " " + LOCDB + "tmp/" + filename +
        "_Textdummy.txt ", LOCDB + "tmp/" + filename + "_ParsText.xml"
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         cwd=parsCit)
    parscitstring = p.communicate()[0]
    with open("tmp/" + filename + "_ParsText.xml", 'w') as f:
        f.write(parscitstring)

    outputxmlsoup = createBibstruct(filename)

    #delete tmp files
    os.remove("tmp/" + filename + '_Textdummy.txt')
    os.remove("tmp/" + filename + '_ParsText.xml')

    with open(OUTPUT_FOLDER + "Output" + filename + '.xml', 'w') as xmlf:
        xmlf.write(outputxmlsoup.encode('utf-8'))

    settings = []
    settings.append("TXT")
    settings.append(Txt_Dummy)
    writeLog(filename, settings, True)
    #writeCorrect(filename)

    os.remove(UPLOAD_FOLDER + filename)

    #create output folder for image file
    if not os.path.exists(OUTPUT_FOLDER + filename):
        os.makedirs(OUTPUT_FOLDER + filename + "/")

    #outputfile = OUTPUT_FOLDER + "Output" + filename+'.xml'
    outputfile = output_folder + filename + '/Output' + filename + ".xml"
    outputfilename = filename.replace(filename.split("_")[0], "")[1:]
    if os.path.exists(outputfile):
        print "Finished inputfile : " + outputfilename
    else:
        print "Error inputfile : " + outputfilename
Ejemplo n.º 5
0
def fileupload():
    print ""
    print ""
    print "----------------------------------------"
    print "request: ", request
    print "request.files: ", request.files
    print "request.url: ", request.url
    print "request.form: ", request.form
    print "----------------------------------------"
    print ""
    print ""
    #checks the upload and stores it to the upload folder
    if request.method == 'POST':
        if 'files' not in request.files:
            flash('No file part')
            return redirect(request.url)
        #requested files and pdf mode are retrieved from flask
        fileList = request.files.getlist('files')
        if request.form.get('pdfFlag'):
            Settings[0] = "IMG"
        else:
            Settings[0] = "TXT"
        if request.form.get('Txt_Dummy'):
            Settings[1] = "True"
        else:
            Settings[1] = "False"
        if request.form.get('colBool'):
            Settings[2] = str(int(request.form.get('colNumb')) - 1)
        else:
            Settings[2] = "0"
        filenameFP_List = []
        for uploadedFile in fileList:
            if uploadedFile.filename == '':
                flash('No selected inputfile')
                return redirect(request.url)
            if uploadedFile and check_file_extension(uploadedFile.filename,
                                                     ALLOWED_EXTENSIONS):
                filenameFP = secure_filename(uploadedFile.filename)
                print 'Uploaded inputfile : ' + filenameFP
                #adding timestamp
                ts = datetime.now().strftime('%m%d%H%M%S')
                filenameFP = ts + "_" + filenameFP
                writeLog(filenameFP, Settings, False)
                uploadedFile.save(os.path.join(UPLOAD_FOLDER, filenameFP))
                filenameFP_List.append(filenameFP)
        processFile(UPLOAD_FOLDER, OUTPUT_FOLDER, MAX_PROCESSES, Settings,
                    filenameFP_List)
        return mergeOutputXML(OUTPUT_FOLDER, filenameFP_List)
        #return redirect("/")

    return "Error"
def fileuploadXML(UPLOAD_FOLDER, OUTPUT_FOLDER, filename):

    #open input file
    with open(UPLOAD_FOLDER + filename, 'r') as f:
        xmlsoup = bs.BeautifulSoup(f.read(), 'xml')

    prepareXML(xmlsoup, filename)
    p = subprocess.Popen([
        "./citeExtract.pl -m extract_citations " + LOCDB + "tmp/" + filename +
        "_XMLdummy.txt ", LOCDB + "tmp/" + filename + "_ParsXML.xml"
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         cwd=parsCit)
    parscitstring = p.communicate()[0]
    with open("tmp/" + filename + "_ParsXML.xml", 'w') as f:
        f.write(parscitstring)

    outputxmlsoup = createBibstruct(xmlsoup, filename)

    #delete tmp files
    os.remove("tmp/" + filename + '_XMLdummy.txt')
    os.remove("tmp/" + filename + '_ParsXML.xml')

    os.makedirs(OUTPUT_FOLDER + filename)
    with open(OUTPUT_FOLDER + filename + "/Output" + filename + '.xml',
              'w') as xmlf:
        xmlf.write(outputxmlsoup.encode('utf-8'))

    settings = []
    settings.append("XML")
    writeLog(filename, settings, True)
    #writeCorrect(filename)

    os.remove(UPLOAD_FOLDER + filename)

    outputfile = OUTPUT_FOLDER + filename + "/Output" + filename + '.xml'
    outputfilename = filename.replace(filename.split("_")[0], "")[1:]
    if os.path.exists(outputfile):
        print "Finished inputfile : " + outputfilename
        writeUserLog("Finished inputfile : " + outputfilename)
    else:
        print "Error inputfile : " + outputfilename
        writeUserLog("Error inputfile : " + outputfilename)
Ejemplo n.º 7
0
def fileupload():
    if debugMode.lower() == "yes":
        print ""
        print "####################################"
        print "In fileupload()"
        print "####################################"
        print ""

    #checks the upload request parameters and stores it to the upload folder
    if request.method == 'POST':
        if 'files' not in request.files:
            flash('No file part')
            return redirect(request.url)
        #requested files and pdf mode are retrieved from flask
        fileList = request.files.getlist('files')
        if request.form.get('pdfFlag'):
            Settings[0] = "IMG"
        else:
            Settings[0] = "TXT"
        if request.form.get('Txt_Dummy'):
            Settings[1] = "True"
        else:
            Settings[1] = "False"

        autoview = False
        if request.form.get('autoviewResults'):
            autoview = True
        filenameFP_List = []
        filenameString = ""
        for uploadedFile in fileList:
            if uploadedFile.filename == '':
                flash('No selected inputfile')
                return redirect(request.url)
            if uploadedFile and check_file_extension(uploadedFile.filename,
                                                     ALLOWED_EXTENSIONS):
                filenameFP = secure_filename(uploadedFile.filename)
                print 'Uploaded inputfile : ' + filenameFP
                print ""
                writeUserLog("Uploaded inputfile : " + filenameFP)
                #adding timestamp
                ts = datetime.now().strftime('%Y%m%d%H%M%S')
                filenameFP = ts + "_" + filenameFP
                writeLog(filenameFP, Settings, False)
                uploadedFile.save(
                    os.path.join(LOCDB + UPLOAD_FOLDER, filenameFP))
                filenameFP_List.append(filenameFP)
                filenameString += filenameFP + "\n"
            else:
                return "Error: Invalid file extension..."
        try:
            job = q.enqueue_call(func=processFile,
                                 args=(
                                     UPLOAD_FOLDER,
                                     OUTPUT_FOLDER,
                                     MAX_PROCESSES,
                                     Settings,
                                     filenameFP_List,
                                 ),
                                 result_ttl=8000,
                                 timeout=80000)
            print(job.get_id())

            return job.get_id()

            #sync process
            filenameFP_List = natsort.natsorted(filenameFP_List)
            result = createResultView(OUTPUT_FOLDER, filenameFP_List)
            return Response(result, content_type='text/xml; charset=utf-8')
        except:
            return "An Error occured during file processing..."

        if autoview:
            return render_template("form_submitocr.html",
                                   waiting="1",
                                   filesText=filenameString)
        else:
            return updateHTML()

    return "Error"
Ejemplo n.º 8
0
def fileuploadXML(UPLOAD_FOLDER, OUTPUT_FOLDER, filename):
    mapOutputsoup = None
    imgOutputXmlSoup = None

    #open input file
    with open(UPLOAD_FOLDER + filename, 'r') as f:
        xmlsoup = bs.BeautifulSoup(f.read(), 'xml')
    response = False
    if filename[-3:].lower() == "xml":
        prepareXML(xmlsoup, filename)
    else:
        response = prepareHTML(xmlsoup, filename)
        if response == True:
            mapOutputsoup = mapHTML(xmlsoup, filename)

    p = subprocess.Popen([
        "./citeExtract.pl -m extract_citations " + LOCDB + "tmp/" + filename +
        "_XMLdummy.txt ", LOCDB + "tmp/" + filename + "_ParsXML.xml"
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         cwd=parsCit)
    parscitstring = p.communicate()[0]
    with open("tmp/" + filename + "_ParsXML.xml", 'w') as f:
        f.write(parscitstring)

    if filename[:-3].lower() == "xml":
        outputxmlsoup = createBibstruct(xmlsoup, filename)
    else:
        outputxmlsoup = createBibstructHTML(filename)

    if mapOutputsoup != None:
        bibtags2 = mapOutputsoup.find_all('BibStructured')
        algotag2 = outputxmlsoup.algorithm
        for currtag in bibtags2:
            algotag2.append(currtag)

    if imgOutputXmlSoup != None:
        bibtags2 = imgOutputXmlSoup.find_all('BibStructured')
        algotag2 = outputxmlsoup.algorithm
        for currtag in bibtags2:
            algotag2.append(currtag)

    os.system("mv " + LOCDB + "tmp/" + filename + '_XMLdummy.txt ' + LOCDB +
              "processed-files/" + filename + '_XMLdummy.txt')
    os.system("mv " + LOCDB + "tmp/" + filename + '_ParsXML.xml ' + LOCDB +
              "processed-files/" + filename + '_ParsXML.xml')

    os.makedirs(LOCDB + OUTPUT_FOLDER + filename)
    with open(LOCDB + OUTPUT_FOLDER + filename + "/Output" + filename + '.xml',
              'w') as xmlf:
        xmlf.write(outputxmlsoup.encode('utf-8'))

    settings = []
    settings.append("XML")
    writeLog(filename, settings, True)
    os.remove(UPLOAD_FOLDER + filename)
    outputfile = OUTPUT_FOLDER + filename + "/Output" + filename + '.xml'
    outputfilename = filename.replace(filename.split("_")[0], "")[1:]
    if os.path.exists(outputfile):
        print "Finished inputfile : " + outputfilename
        writeUserLog("Finished inputfile : " + outputfilename)
    else:
        print "Error inputfile : " + outputfilename
        writeUserLog("Error inputfile : " + outputfilename)