Beispiel #1
0
def readPlainText(htmlurl, plaintext, ontologyData, _encoding):
    """Decode a plain-text payload and run entity extraction on its pieces.

    The payload is decoded with the first codec that works, trying in
    order: ``_encoding``, ``sys.stdout.encoding``, UTF-8 and latin-1.  The
    decoded text is stripped, split with ``comm.replaceToPunkts`` and each
    resulting piece is passed to ``getEntities.getEntities``.

    Args:
        htmlurl: Source URL; forwarded to ``getEntities.getEntities`` and
            included in the error record.
        plaintext: The payload.  Expected to provide ``decode`` (e.g.
            bytes); a value that cannot be decoded is used unchanged.
        ontologyData: Forwarded untouched to ``getEntities.getEntities``.
        _encoding: Preferred codec name; may be unknown or ``None``.

    Returns:
        None.  Failures are reported through ``comm.printException``
        rather than raised.
    """
    try:
        # latin-1 maps every byte value, so for bytes input it always
        # succeeds and terminates the chain.  'ISO-8859-1' is an alias of
        # the same codec, so no further fallbacks are needed after it.
        candidates = (
            _encoding,
            getattr(sys.stdout, 'encoding', None),
            'UTF-8',
            'latin-1',
        )

        # Fallback when nothing can decode the payload (for example it is
        # already text and has no ``decode``): use it as given.
        punc = plaintext
        for codec in candidates:
            try:
                punc = plaintext.decode(codec).strip()
                break
            except (AttributeError, LookupError, TypeError, ValueError):
                # AttributeError: payload has no ``decode``.
                # LookupError:    unknown codec name.
                # TypeError:      codec name is None / not a string.
                # ValueError:     includes UnicodeDecodeError.
                continue

        for sentence in comm.replaceToPunkts(punc):
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except Exception:
        # %-formatting keeps the message identical for string arguments
        # but does not blow up inside the handler when _encoding is None.
        comm.printException(comm.pathToSaveParsingErrors,
                            "read_plaintext.py %s %s" % (_encoding, htmlurl))
Beispiel #2
0
def readPlainText(htmlurl, plaintext, ontologyData, _encoding):
    """Decode a plain-text payload and run entity extraction on its pieces.

    The payload is decoded with the first codec that works, trying in
    order: ``_encoding``, ``sys.stdout.encoding``, UTF-8 and latin-1.  The
    decoded text is stripped, split with ``comm.replaceToPunkts`` and each
    resulting piece is passed to ``getEntities.getEntities``.

    Args:
        htmlurl: Source URL; forwarded to ``getEntities.getEntities`` and
            included in the error record.
        plaintext: The payload.  Expected to provide ``decode`` (e.g.
            bytes); a value that cannot be decoded is used unchanged.
        ontologyData: Forwarded untouched to ``getEntities.getEntities``.
        _encoding: Preferred codec name; may be unknown or ``None``.

    Returns:
        None.  Failures are reported through ``comm.printException``
        rather than raised.
    """
    try:
        # latin-1 maps every byte value, so for bytes input it always
        # succeeds and terminates the chain.  'ISO-8859-1' is an alias of
        # the same codec, so no further fallbacks are needed after it.
        candidates = (
            _encoding,
            getattr(sys.stdout, 'encoding', None),
            'UTF-8',
            'latin-1',
        )

        # Fallback when nothing can decode the payload (for example it is
        # already text and has no ``decode``): use it as given.
        punc = plaintext
        for codec in candidates:
            try:
                punc = plaintext.decode(codec).strip()
                break
            except (AttributeError, LookupError, TypeError, ValueError):
                # AttributeError: payload has no ``decode``.
                # LookupError:    unknown codec name.
                # TypeError:      codec name is None / not a string.
                # ValueError:     includes UnicodeDecodeError.
                continue

        for sentence in comm.replaceToPunkts(punc):
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except Exception:
        # %-formatting keeps the message identical for string arguments
        # but does not blow up inside the handler when _encoding is None.
        comm.printException(comm.pathToSaveParsingErrors, "read_plaintext.py %s %s" % (_encoding, htmlurl))
Beispiel #3
0
def readExcel(filePath, url, ontologyData):
    """Fetch a workbook and run entity extraction on every text cell.

    Calls ``urr(url, filePath)`` first (presumably this downloads ``url``
    to ``filePath`` -- confirm against the import), then opens
    ``filePath`` with ``xlrd`` and walks every cell of every sheet.  The
    value of each text cell is split with ``comm.replaceToPunkts`` and
    every resulting piece is passed to ``getEntities.getEntities``.

    Args:
        filePath: Local path handed to ``urr`` and ``xlrd.open_workbook``.
        url: Source URL; forwarded to ``getEntities.getEntities`` and
            included in the error record.
        ontologyData: Forwarded untouched to ``getEntities.getEntities``.

    Returns:
        None.  Failures in any step are reported through
        ``comm.printException`` rather than raised.
    """
    # Cell types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error,
    # 6=Blank.  Only text cells are processed.
    textCellType = 1

    try:
        urr(url, filePath)
        workbook = xlrd.open_workbook(filePath)
        for worksheet_name in workbook.sheet_names():
            worksheet = workbook.sheet_by_name(worksheet_name)
            for curr_row in range(worksheet.nrows):
                for curr_cell in range(worksheet.ncols):
                    if worksheet.cell_type(curr_row, curr_cell) != textCellType:
                        continue
                    cell_value = worksheet.cell_value(curr_row, curr_cell)
                    for sentence in comm.replaceToPunkts(cell_value):
                        getEntities.getEntities(url, sentence, ontologyData)
    except Exception:
        # %-formatting yields the same text for a string url but cannot
        # itself fail inside the handler if url is not a string.
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py %s" % (url,))
Beispiel #4
0
def readPdf(filePath, url, od):
    """Fetch a PDF and run entity extraction on the text of every page.

    Calls ``urldownl(url, filePath)`` first (presumably this downloads
    ``url`` to ``filePath`` -- confirm against the import), then reads
    ``filePath`` with ``PdfFileReader``.  The text extracted from each page
    is split with ``comm.replaceToPunkts`` and every resulting piece is
    passed to ``getEntities.getEntities``.

    Args:
        filePath: Local path handed to ``urldownl`` and opened for reading.
        url: Source URL; forwarded to ``getEntities.getEntities`` and
            included in the error record.
        od: Forwarded untouched to ``getEntities.getEntities``.

    Returns:
        None.  Errors raised while iterating or processing pages are
        reported through ``comm.printException``; errors from the download,
        from opening the file or from constructing the reader propagate to
        the caller.
    """
    urldownl(url, filePath)
    # The reader pulls data from the stream while pages are accessed, so
    # the file has to stay open for the whole loop; ``with`` guarantees it
    # is closed afterwards (the handle used to be leaked).
    with open(filePath, "rb") as pdfFile:
        pdf = PdfFileReader(pdfFile)
        pdf.strict = True

        try:
            for page in pdf.pages:
                text = page.extractText()
                for sentence in comm.replaceToPunkts(text):
                    getEntities.getEntities(url, sentence, od)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
Beispiel #5
0
def readPdf(filePath, url, od):
    """Fetch a PDF and run entity extraction on the text of every page.

    Calls ``urldownl(url, filePath)`` first (presumably this downloads
    ``url`` to ``filePath`` -- confirm against the import), then reads
    ``filePath`` with ``PdfFileReader``.  The text extracted from each page
    is split with ``comm.replaceToPunkts`` and every resulting piece is
    passed to ``getEntities.getEntities``.

    Args:
        filePath: Local path handed to ``urldownl`` and opened for reading.
        url: Source URL; forwarded to ``getEntities.getEntities`` and
            included in the error record.
        od: Forwarded untouched to ``getEntities.getEntities``.

    Returns:
        None.  Errors raised while iterating or processing pages are
        reported through ``comm.printException``; errors from the download,
        from opening the file or from constructing the reader propagate to
        the caller.
    """
    urldownl(url, filePath)
    # The reader pulls data from the stream while pages are accessed, so
    # the file has to stay open for the whole loop; ``with`` guarantees it
    # is closed afterwards (the handle used to be leaked).
    with open(filePath, "rb") as pdfFile:
        pdf = PdfFileReader(pdfFile)
        pdf.strict = True

        try:
            for page in pdf.pages:
                text = page.extractText()
                for sentence in comm.replaceToPunkts(text):
                    getEntities.getEntities(url, sentence, od)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
Beispiel #6
0
def readHtmlPage(htmlurl, readedPage, ontologyData, _encoding):
    """Parse an HTML page and run entity extraction on its body text.

    The document is obtained with ``parse(htmlurl)``.  Elements that carry
    no running text (head, script, style, noscript, input, form, title,
    img) are dropped, the text content of each ``body`` element is split
    with ``comm.replaceToPunkts`` and every stripped piece is passed to
    ``getEntities.getEntities``.

    Args:
        htmlurl: Location handed to ``parse``; also forwarded to
            ``getEntities.getEntities`` and included in the error record.
        readedPage: Not used by this function; kept so existing callers
            keep working.
        ontologyData: Forwarded untouched to ``getEntities.getEntities``.
        _encoding: Only used in the error record; may be ``None``.

    Returns:
        None.  Failures are reported through ``comm.printException``
        rather than raised.
    """
    unwantedTags = ("head", "script", "style", "noscript",
                    "input", "form", "title", "img")
    # %-formatting yields the same text for string arguments but does not
    # fail inside an error handler when _encoding is None.
    errorInfo = "read_html.py %s %s" % (_encoding, htmlurl)

    try:
        root = parse(htmlurl).getroot()
        if root is None:
            return

        for tag in unwantedTags:
            # Snapshot the matches first: dropping elements while the live
            # iterator is still walking the tree can make it skip nodes.
            for element in list(root.iter(tag)):
                element.drop_tree()

        sentences = set()
        for element in root.iter("body"):
            try:
                sentences.add(element.text_content())
            except Exception:
                # Best effort: record the failure and keep going with the
                # remaining elements instead of hiding it silently.
                comm.printException(comm.pathToSaveParsingErrors, errorInfo)

        for lau in sentences:
            if lau != "":
                for s6ne in comm.replaceToPunkts(lau):
                    getEntities.getEntities(htmlurl, s6ne.strip(),
                                            ontologyData)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors, errorInfo)
Beispiel #7
0
def readHtmlPage(htmlurl, readedPage, ontologyData, _encoding):
    """Parse an HTML page and run entity extraction on its body text.

    The document is obtained with ``parse(htmlurl)``.  Elements that carry
    no running text (head, script, style, noscript, input, form, title,
    img) are dropped, the text content of each ``body`` element is split
    with ``comm.replaceToPunkts`` and every stripped piece is passed to
    ``getEntities.getEntities``.

    Args:
        htmlurl: Location handed to ``parse``; also forwarded to
            ``getEntities.getEntities`` and included in the error record.
        readedPage: Not used by this function; kept so existing callers
            keep working.
        ontologyData: Forwarded untouched to ``getEntities.getEntities``.
        _encoding: Only used in the error record; may be ``None``.

    Returns:
        None.  Failures are reported through ``comm.printException``
        rather than raised.
    """
    unwantedTags = ("head", "script", "style", "noscript",
                    "input", "form", "title", "img")
    # %-formatting yields the same text for string arguments but does not
    # fail inside an error handler when _encoding is None.
    errorInfo = "read_html.py %s %s" % (_encoding, htmlurl)

    try:
        root = parse(htmlurl).getroot()
        if root is None:
            return

        for tag in unwantedTags:
            # Snapshot the matches first: dropping elements while the live
            # iterator is still walking the tree can make it skip nodes.
            for element in list(root.iter(tag)):
                element.drop_tree()

        sentences = set()
        for element in root.iter("body"):
            try:
                sentences.add(element.text_content())
            except Exception:
                # Best effort: record the failure and keep going with the
                # remaining elements instead of hiding it silently.
                comm.printException(comm.pathToSaveParsingErrors, errorInfo)

        for lau in sentences:
            if lau != "":
                for s6ne in comm.replaceToPunkts(lau):
                    getEntities.getEntities(htmlurl, s6ne.strip(), ontologyData)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors, errorInfo)
Beispiel #8
0
def startToGetEntities(jsonurl, lause, ontologyData):
    """Split ``lause`` and run entity extraction on each resulting piece.

    ``lause`` is passed through ``comm.replaceToPunkts``; every item it
    yields is handed to ``getEntities.getEntities`` together with
    ``jsonurl`` and ``ontologyData``.  Nothing is returned.
    """
    for piece in comm.replaceToPunkts(lause):
        getEntities.getEntities(jsonurl, piece, ontologyData)
Beispiel #9
0
def startToGetEntities(jsonurl, lause, ontologyData):
    """Split ``lause`` and run entity extraction on each resulting piece.

    ``lause`` is passed through ``comm.replaceToPunkts``; every item it
    yields is handed to ``getEntities.getEntities`` together with
    ``jsonurl`` and ``ontologyData``.  Nothing is returned.
    """
    for piece in comm.replaceToPunkts(lause):
        getEntities.getEntities(jsonurl, piece, ontologyData)