예제 #1
0
파일: i2graph.py 프로젝트: saoruy/DCC
def run(input_path, op_path, ontology_file, model_dir):
    """Run the image2graph pipeline for the papers in ``input_path``.

    Steps, in order:

    1. Invoke the pdffigures2 batch CLI through the shell, with both its
       ``-m`` and ``-d`` outputs pointed at
       ``<op_path>/image2graph/all_images``.
    2. Load the figure-type classifier and the pickled annotation tables
       from ``model_dir``.
    3. For every ``*png`` in ``all_images`` whose path contains ``Figure``
       and whose caption is flagged by ``isDiag`` but not by ``isResult``,
       detect components, text and lines, then write the per-figure graph
       through ``createDiag2Graph``.
    4. Call ``runI2G`` to build the RDF graph.

    Args:
        input_path: Directory handed to pdffigures2 as its input folder and
            forwarded to ``runI2G``.
        op_path: Root output directory; everything is written below
            ``<op_path>/image2graph``.
        ontology_file: Forwarded unchanged to ``runI2G``.
        model_dir: Directory holding the classifier models and
            ``full_annotations.pcl``.
    """
    i2g_output_dir = os.path.join(op_path, "image2graph")
    op_path_all = os.path.join(i2g_output_dir, "all_images")
    # Creates i2g_output_dir as well; exist_ok avoids the check-then-create race.
    os.makedirs(op_path_all, exist_ok=True)

    # NOTE(review): the paths are interpolated into a shell string unquoted,
    # so paths containing spaces or shell metacharacters will not survive.
    # The ';' classpath separator and the trailing backslash are kept as in
    # the original command line.
    command = (
        'java -cp "pdffigures2_2.12-0.1.0.jar;pdffigures2-assembly-0.1.0-deps.jar;scala-library.jar" '
        'org.allenai.pdffigures2.FigureExtractorBatchCli '
        + input_path + '/ -s stat_file.json -m ' + op_path_all
        + '/ -d ' + op_path_all + '\\'
    )
    process = subprocess.Popen(command, shell=True, stderr=subprocess.PIPE)
    # communicate() drains the stderr pipe and waits for the process to exit.
    process.communicate()
    if process.returncode != 0:
        # Best effort: report the failure but keep going, since images from
        # an earlier extraction may already be present.
        print("[WARN] Pdf2Fig terminated with status %d." % process.returncode)

    print("[INFO] Loading trained models ...")

    figtypedetector = ftd(model_dir)
    figtypedetector.loadFigClassModels("vgg16")

    print("[INFO] Loading and analyzing images ...")

    # NOTE(review): pickle.load can execute arbitrary code; model_dir must be
    # a trusted location.
    with open(os.path.join(model_dir, 'full_annotations.pcl'), 'rb') as f:
        # Only entity_map is used below; the 3-way unpack still validates
        # the shape of the pickled payload.
        entity_map, _uri2entity, _uri2rel = pickle.load(f)

    for filename in glob(os.path.join(op_path_all, '*png')):
        if 'Figure' not in filename:
            continue

        parsejson = pj()
        fig_caption, fig_text, paper_file_name = parsejson.getCaption_noCSV(filename)

        figTypeResult = parsejson.isResult(fig_caption)
        figTypeDiag = parsejson.isDiag(fig_caption)
        if figTypeResult or not figTypeDiag:
            continue

        im, thresh_im, gray_imcv = preprocessImage(filename, 0)
        _binType, mcType = figtypedetector.detectFigType(im)
        if not mcType < 3:
            continue

        # Create each sub-directory on its own: the paper directory may
        # already exist without them, and cv2.imwrite needs the target
        # directory to be present.
        paper_dir = os.path.join(i2g_output_dir, paper_file_name)
        figures_dir = os.path.join(paper_dir, "Figures")
        os.makedirs(os.path.join(paper_dir, "diag2graph"), exist_ok=True)
        os.makedirs(figures_dir, exist_ok=True)

        cv2.imwrite(os.path.join(figures_dir, os.path.basename(filename)), im)

        shapedetector = sd()
        component, flow_dir = shapedetector.find_component(filename, i2g_output_dir, im, thresh_im, gray_imcv)

        textdetector = tda()
        text_list = textdetector.combinedTextDetect(filename, im, component, fig_text)

        arrowdetector = ad()
        line_connect = arrowdetector.detectLines(im, thresh_im, gray_imcv, component, text_list)

        graphcreator = tgv2()
        # paper_title, paper_conf and paper_year are not available on this
        # path, so None is passed for them.
        graphcreator.createDiag2Graph(i2g_output_dir, filename, im, thresh_im, component, flow_dir, text_list, line_connect, None, paper_file_name, None, None, fig_caption)

    print("[INFO] Creating RDF graph ...")

    runI2G(input_path, entity_map, i2g_output_dir, op_path_all, ontology_file)

    print("[Info] Completed image2graph pipeline!")
예제 #2
0
    def createDiag2Graph(self, op_dir, filename, img, thresh_im, comp, flow_dir, text_list, line_list, paper_title, paper_file_name, paper_conf, paper_year, fig_caption):
        """Write the triple file and the annotated image for one figure.

        Two files are produced in ``<op_dir>/<paper_file_name>/diag2graph``
        (the directory is created when missing):

        * ``<figure>.txt`` -- one triple-like statement per line
          (``isA``, ``foundIn``, ``hasCaption``, ``partOf``, ``hasPos``,
          ``isType``, ``hasDescription``, ``hasFlow``).
        * a copy of ``img`` under the original file name, with component
          contours, component ids and text overlaid.

        Args:
            op_dir: Root output directory.
            filename: Path of the source figure image; its base name names
                both output files and is used as the figure id.
            img: Image the annotations are drawn on (a copy is modified).
            thresh_im: Forwarded to ``getLineCompTag``.
            comp: Contours, indexed by the component ids that
                ``getTextCompTag`` returns.
            flow_dir: Initial flow direction, forwarded to
                ``find_dominant_flow``.
            text_list: Detected text entries, forwarded to ``getTextCompTag``.
            line_list: Detected lines, forwarded to ``getTextCompTag`` and
                ``getLineCompTag``.
            paper_title: Optional; when truthy a ``foundIn`` line is written.
            paper_file_name: Name of the per-paper output sub-directory.
            paper_conf: Unused.
            paper_year: Unused.
            fig_caption: Caption text written as ``hasCaption``.
        """
        image_basename = os.path.basename(filename)
        FigureID = os.path.splitext(image_basename)[0]

        graph_dir = os.path.join(op_dir, paper_file_name, "diag2graph")
        op_image_name = os.path.join(graph_dir, image_basename)
        op_file_name = os.path.join(graph_dir, FigureID + '.txt')

        # Create intermediate directories.
        os.makedirs(graph_dir, exist_ok=True)

        # 'with' guarantees the triple file is closed even if a detector
        # or an OpenCV call raises part-way through.
        with open(op_file_name, "w", encoding="utf-8") as op_file:
            op_file.write(":%s isA Figure \n" % (FigureID))
            if paper_title:
                op_file.write(":%s foundIn %s \n" % (FigureID, paper_title))
            op_file.write(":%s hasCaption %s \n" % (FigureID, fig_caption))

            final_graph_im = img.copy()
            (compWithText, TextWOComp, _compWOText) = self.getTextCompTag(comp, text_list, line_list)
            arrowdetector = ad()
            line_connect = arrowdetector.getLineCompTag(img, thresh_im, comp, line_list, TextWOComp)

            node_list = []

            # --- components that contain text --------------------------
            for k, comp_texts in compWithText.items():
                op_file.write(":Comp%d partOf :%s \n" % (k, FigureID))

                cnt = comp[k]
                M = cv2.moments(cnt)
                if M["m00"] != 0:
                    cX = int(M["m10"] / M["m00"])
                    cY = int(M["m01"] / M["m00"])
                else:
                    # Zero-area contour: no centroid, fall back to origin.
                    cX, cY = 0, 0

                temp_node = {
                    'nodeId': 'Comp' + str(k),
                    'center': [cX, cY],
                    'layerName': "",
                    'description': [],
                }

                cv2.putText(final_graph_im, str(k), (cX-20, cY-20), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                cv2.drawContours(final_graph_im, [cnt], 0, (255, 0, 255), 2)

                op_file.write(":Comp%d hasPos %s \n" % (k, str(cv2.boundingRect(cnt))))

                text_pos = 0
                for txt in comp_texts:
                    if not temp_node['layerName']:
                        # Keep looking for a layer name until one is found.
                        found, layer_name, _remaining = self.find_layerName(txt)
                        if found:
                            op_file.write(":Comp%d isType %s \n" % (k, layer_name))
                            temp_node['layerName'] = layer_name
                        temp_node['description'].append(txt)
                    elif re.search("layer", txt, flags=0) is None:
                        # Layer already named: skip further "layer" strings.
                        temp_node['description'].append(txt)

                    cv2.putText(final_graph_im, txt, (cX, cY + text_pos), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 255), 2)
                    text_pos += 15

                op_file.write(":Comp%d hasDescription %s \n" % (k, temp_node['description']))
                node_list.append(temp_node)

            # --- free-standing text ------------------------------------
            # Raw string: same pattern as before, without the invalid
            # '\~' escape in a normal string literal.
            # NOTE(review): this pattern can never match -- '$' must be
            # followed by '%', which is impossible at end of input -- so
            # the filter below is effectively a no-op. It looks like a
            # character class with the brackets missing. It is left as is
            # because enabling it would drop text nodes; confirm intent.
            regex = re.compile(r'@_!#$%^&/\~:')
            TextWOComp = self.remove_textDuplicate(TextWOComp)
            for ((startX, startY, endX, endY), op, prob, textID) in TextWOComp:
                if len(op) > 1 and regex.search(op) is None:
                    node_list.append({
                        'nodeId': 'Text' + str(textID),
                        'center': [(startX+endX)/2, (startY+endY)/2],
                        'layerName': "",
                        'description': op,
                    })

                    op_file.write(":Text%d partOf :%s \n" % (textID, FigureID))
                    op_file.write(":Text%d hasPos %s \n" % (textID, str((startX, startY, endX-startX, endY-startY))))
                    op_file.write(":Text%d hasDescription %s \n" % (textID, op.split()))

                    cv2.rectangle(final_graph_im, (startX, startY), (endX, endY), (0, 200, 0), 2)
                    cv2.putText(final_graph_im, op, (startX+10, startY+10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # --- connections -------------------------------------------
            if line_connect:
                followedby_comps = []
                connected_comps = []

                for ((startx, starty, endx, endy), start_comp_found, end_comp_found, start_text_found, end_text_found) in line_connect:
                    # Pick the endpoint pair; -1 means "nothing found".
                    if start_comp_found != -1 and end_comp_found != -1:
                        start_id, end_id, start_type, end_type = start_comp_found, end_comp_found, "Comp", "Comp"
                    elif start_text_found != -1 and end_text_found != -1:
                        start_id, end_id, start_type, end_type = start_text_found, end_text_found, "Text", "Text"
                    elif start_comp_found != -1 and end_text_found != -1:
                        start_id, end_id, start_type, end_type = start_comp_found, end_text_found, "Comp", "Text"
                    elif end_comp_found != -1 and start_text_found != -1:
                        # The end point is the component here. The earlier
                        # code passed end_text_found, which is always -1
                        # in this branch.
                        start_id, end_id, start_type, end_type = start_text_found, end_comp_found, "Text", "Comp"
                    else:
                        continue

                    s, e, conn = self.nodeSequence(start_id, end_id, node_list, start_node_type=start_type, end_node_type=end_type)
                    if conn == "followedBy":
                        followedby_comps.append((s, e, conn))
                    elif conn == "connectedTo":
                        connected_comps.append((s, e, conn))

                flow_dir_updated = self.find_dominant_flow(followedby_comps, node_list, flow_dir)

                node_list = self.update_connect_info(connected_comps, node_list, flow_dir_updated)

                op_file.write(":%s hasFlow %s \n" % (FigureID, flow_dir_updated))

        cv2.imwrite(op_image_name, final_graph_im)
            
        
예제 #3
0
파일: DiagAnalysis.py 프로젝트: saoruy/DCC
                                             "Figures"))

                        cv2.imwrite(
                            os.path.join(
                                op_path, paper_file_name + "/Figures/" +
                                os.path.basename(filename)), im)

                        shapedetector = sd()
                        component, flow_dir = shapedetector.find_component(
                            filename, op_path, im, thresh_im, gray_imcv)

                        textdetector = tda()
                        text_list = textdetector.combinedTextDetect(
                            filename, im, component, fig_text)

                        arrowdetector = ad()
                        line_connect = arrowdetector.detectLines(
                            im, thresh_im, gray_imcv, component, text_list)

                        graphcreator = tgv2()
                        graphcreator.createDiag2Graph(
                            op_path, filename, im, thresh_im, component,
                            flow_dir, text_list, line_connect, paper_title,
                            paper_file_name, paper_conf, paper_year,
                            fig_caption)
                print(
                    "..........................................................................."
                )

    else:
예제 #4
0
def run(input_path, op_path, ontology_file, model_dir):
    """Run the image2graph pipeline for the papers in ``input_path``.

    Steps, in order:

    1. Invoke the pdffigures2 batch CLI through the shell, with both its
       ``-m`` and ``-d`` outputs pointed at
       ``<op_path>/image2graph/all_images``.
    2. Load the figure-type classifier from ``model_dir``.
    3. For every ``*png`` in ``all_images`` whose path contains ``Figure``
       and whose caption is flagged by ``isDiag`` but not by ``isResult``,
       detect components, text and lines, then write the per-figure graph
       through ``createDiag2Graph``.
    4. Call ``runI2G`` to build the RDF graph.

    Args:
        input_path: Directory handed to pdffigures2 as its input folder and
            forwarded to ``runI2G``.
        op_path: Root output directory; everything is written below
            ``<op_path>/image2graph``.
        ontology_file: Forwarded unchanged to ``runI2G``.
        model_dir: Directory holding the classifier models.
    """
    i2g_output_dir = op_path + "/image2graph"
    op_path_all = i2g_output_dir + "/all_images"
    # Creates i2g_output_dir as well; exist_ok avoids the check-then-create race.
    os.makedirs(op_path_all, exist_ok=True)

    # NOTE(review): the paths are interpolated into a shell string unquoted,
    # so paths containing spaces or shell metacharacters will not survive.
    # The ';' classpath separator and the trailing backslash are kept as in
    # the original command line.
    command = (
        'java -cp "pdffigures2_2.12-0.1.0.jar;pdffigures2-assembly-0.1.0-deps.jar;scala-library.jar" '
        'org.allenai.pdffigures2.FigureExtractorBatchCli '
        + input_path + '/ -s stat_file.json -m ' + op_path_all
        + '/ -d ' + op_path_all + '\\'
    )
    process = subprocess.Popen(command, shell=True, stderr=subprocess.PIPE)
    # communicate() drains the stderr pipe and waits for the process to exit.
    process.communicate()
    if process.returncode != 0:
        # Best effort: report the failure but keep going, since images from
        # an earlier extraction may already be present.
        print("[WARN] Pdf2Fig terminated with status %d." % process.returncode)

    print("[INFO] Loading trained models ...")

    figtypedetector = ftd(model_dir)
    figtypedetector.loadFigClassModels("vgg16")

    print("[INFO] Loading and analyzing images ...")

    for filename in glob.glob(os.path.join(op_path_all, '*png')):
        # Normalise Windows separators so downstream string handling sees
        # forward slashes only.
        filename = filename.replace("\\", "/")
        if 'Figure' not in filename:
            continue

        parsejson = pj()
        fig_caption, fig_text, paper_file_name = parsejson.getCaption_noCSV(
            filename)

        figTypeResult = parsejson.isResult(fig_caption)
        figTypeDiag = parsejson.isDiag(fig_caption)
        if figTypeResult or not figTypeDiag:
            continue

        im, thresh_im, gray_imcv = preprocessImage(filename, 0)
        _binType, mcType = figtypedetector.detectFigType(im)
        if not mcType < 3:
            continue

        # Create each sub-directory on its own: the paper directory may
        # already exist without them, and cv2.imwrite needs the target
        # directory to be present.
        paper_dir = os.path.join(i2g_output_dir, paper_file_name)
        os.makedirs(os.path.join(paper_dir, "diag2graph"), exist_ok=True)
        os.makedirs(os.path.join(paper_dir, "Figures"), exist_ok=True)

        cv2.imwrite(
            os.path.join(
                i2g_output_dir, paper_file_name + "/Figures/" +
                os.path.basename(filename)), im)

        shapedetector = sd()
        component, flow_dir = shapedetector.find_component(
            filename, i2g_output_dir, im, thresh_im, gray_imcv)

        textdetector = tda()
        text_list = textdetector.combinedTextDetect(
            filename, im, component, fig_text)

        arrowdetector = ad()
        line_connect = arrowdetector.detectLines(
            im, thresh_im, gray_imcv, component, text_list)

        graphcreator = tgv2()
        # paper_title, paper_conf and paper_year are not available on this
        # path, so None is passed for them.
        graphcreator.createDiag2Graph(i2g_output_dir, filename, im,
                                      thresh_im, component,
                                      flow_dir, text_list,
                                      line_connect, None,
                                      paper_file_name, None, None,
                                      fig_caption)

    print("[INFO] Creating RDF graph ...")

    runI2G(input_path, i2g_output_dir, op_path_all, ontology_file)

    print("[Info] Completed image2graph pipeline!")


#inputFolder = 'demo_input'
#outputFolder = 'Output_a'
## outputFolderImages = 'Output'
##
#run(inputFolder, outputFolder)
예제 #5
0
파일: Diag2Graph.py 프로젝트: louisccc/DCC
    def createDiag2Graph(self, op_dir, filename, img, thresh_im, comp,
                         text_list, line_list):
        """Write a textual graph description and an annotated image.

        Outputs (directories are created when missing):

        * ``<op_dir>/OpGraph/graph<figure>.txt`` -- one statement per line
          describing component types/descriptions, components without
          text, free-standing text and the connections found.
        * ``<op_dir>/OpImage/op<file name>`` -- a copy of ``img`` with
          contours, ids, text boxes and lines drawn on it.

        Most statements are also echoed to stdout.

        Args:
            op_dir: Root output directory.
            filename: Path of the source figure image; only its base name
                is used, to name the outputs.
            img: Image the annotations are drawn on (a copy is modified).
            thresh_im: Forwarded to ``getLineCompTag``.
            comp: Contours, indexed by the component ids that
                ``getTextCompTag`` returns.
            text_list: Detected text entries, forwarded to ``getTextCompTag``.
            line_list: Detected lines, forwarded to ``getLineCompTag``.
        """
        base_name = os.path.basename(filename)
        op_image_name = os.path.join(op_dir, "OpImage/op" + base_name)
        op_file_name = os.path.join(
            op_dir, "OpGraph/graph" + os.path.splitext(base_name)[0] + ".txt")

        # open() fails and cv2.imwrite cannot save the image when the target
        # directory is missing, so make sure both exist up front.
        for out_path in (op_image_name, op_file_name):
            os.makedirs(os.path.dirname(out_path), exist_ok=True)

        def centroid(contour):
            # Centre of mass from image moments; (0, 0) for zero-area
            # contours, where the division would be undefined.
            M = cv2.moments(contour)
            if M["m00"] != 0:
                return int(M["m10"] / M["m00"]), int(M["m01"] / M["m00"])
            return 0, 0

        # Explicit UTF-8 so detected text with non-ASCII characters does not
        # depend on the platform default encoding; 'with' closes the file
        # even when a later step raises.
        with open(op_file_name, "w", encoding="utf-8") as op_file:

            def record(message):
                # Echo a statement to stdout and append it to the graph file.
                print(message)
                op_file.write(message)

            final_graph_im = img.copy()
            (compWithText, TextWOComp,
             compWOText) = self.getTextCompTag(comp, text_list)

            arrowdetector = ad()
            line_connect = arrowdetector.getLineCompTag(img, thresh_im, comp,
                                                        line_list, TextWOComp)

            # --- components that contain text --------------------------
            for k, comp_texts in compWithText.items():
                cnt = comp[k]
                cX, cY = centroid(cnt)

                cv2.putText(final_graph_im, str(k), (cX - 20, cY - 20),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                cv2.drawContours(final_graph_im, [cnt], 0, (255, 0, 255), 2)
                print("=======+++++++++++++++++++++=======")

                text_pos = 0
                for txt in comp_texts:
                    found, layer_name, remaining_txt = self.find_layerName(txt)
                    if found:
                        record("Component number %d \"has type\": %s \n" %
                               (k, layer_name))
                        record(
                            "Component number %d \"has description\": %s \n" %
                            (k, remaining_txt))
                    else:
                        record(
                            "Component number %d \"has description\": %s \n" %
                            (k, txt))

                    cv2.putText(final_graph_im, txt, (cX, cY + text_pos),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 255),
                                2)
                    text_pos += 15

            # --- components without text -------------------------------
            for index in compWOText:
                print(
                    "----------------Component number %d has text: None --------------------"
                    % (index))
                op_file.write("Component number %d has text: None \n" %
                              (index))

                cnt = comp[index]
                cX, cY = centroid(cnt)

                cv2.drawContours(final_graph_im, [cnt], 0, (0, 0, 255), 2)
                cv2.putText(final_graph_im, str(index), (cX, cY),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            # --- free-standing text ------------------------------------
            for ((startX, startY, endX, endY), op, prob) in TextWOComp:
                print(
                    "................ Text without Component: %s....................."
                    % (op))
                op_file.write("Text Component: %s \n" % (op))

                cv2.rectangle(final_graph_im, (startX, startY), (endX, endY),
                              (0, 200, 0), 2)
                cv2.putText(final_graph_im, op, (startX + 10, startY + 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # --- connections -------------------------------------------
            # -1 marks an endpoint for which nothing was found.
            for ((startx, starty, endx, endy), start_comp_found,
                 end_comp_found, start_text_found,
                 end_text_found) in line_connect:
                if start_comp_found != -1 and end_comp_found != -1:
                    record(
                        "Component number %d \"connected to\" Component number %d \n"
                        % (start_comp_found, end_comp_found))
                elif start_text_found != -1 and end_text_found != -1:
                    text_pair = (TextWOComp[start_text_found][1],
                                 TextWOComp[end_text_found][1])
                    # Console and file wording differ on purpose (kept
                    # from the original output format).
                    print("Text Box %s \"connected to\" TextBox %s \n" %
                          text_pair)
                    op_file.write("Text %s \"connected to\" Text %s \n" %
                                  text_pair)
                elif start_comp_found != -1 and end_text_found != -1:
                    comp_text = (start_comp_found,
                                 TextWOComp[end_text_found][1])
                    print(
                        "Component Number %d \"connected to\" TextBox %s \n" %
                        comp_text)
                    op_file.write(
                        "Component Number %d \"connected to\" Text %s \n" %
                        comp_text)
                elif end_comp_found != -1 and start_text_found != -1:
                    comp_text = (end_comp_found,
                                 TextWOComp[start_text_found][1])
                    print(
                        "Component Number %d \"connected to\" TextBox %s \n" %
                        comp_text)
                    op_file.write(
                        "Component Number %d \"connected to\" Text %s \n" %
                        comp_text)

                # Every detected line is drawn, matched or not.
                cv2.line(final_graph_im, (startx, starty), (endx, endy),
                         (0, 200, 255), 2)

        cv2.imwrite(op_image_name, final_graph_im)
        print(
            "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
        )