def readPlainText(htmlurl, plaintext, ontologyData, _encoding):
    """Decode *plaintext* bytes, split the text into sentences and run
    entity extraction on each sentence.

    Candidate encodings are tried in order (caller-supplied hint first);
    if every decode fails the raw input is used as-is.  Any failure is
    logged via comm.printException rather than raised.
    """
    try:
        punc = plaintext  # fallback: keep raw input if every decode fails
        # Same fallback chain as before, flattened from the nested try/except
        # ladder.  None means "decode with the platform default".
        for enc in (_encoding, sys.stdout.encoding, 'UTF-8', 'latin-1', 'ISO-8859-1', None):
            try:
                if enc is None:
                    punc = plaintext.decode().strip()
                else:
                    punc = plaintext.decode(enc).strip()
                break
            except Exception:
                continue
        sentences = comm.replaceToPunkts(punc)
        for sentence in sentences:
            getEntities.getEntities(htmlurl, sentence, ontologyData)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors, "read_plaintext.py " + _encoding + " " + htmlurl)
def readExcel(filePath, url, ontologyData):
    """Download the spreadsheet at *url* to *filePath*, then extract
    entities from every text cell of every worksheet.

    Download errors and parse errors are logged via comm.printException;
    nothing is raised to the caller.
    """
    try:
        urr(url, filePath)  # download the workbook to the local path
        try:
            workbook = xlrd.open_workbook(filePath)
            for worksheet_name in workbook.sheet_names():
                worksheet = workbook.sheet_by_name(worksheet_name)
                for curr_row in range(worksheet.nrows):
                    for curr_cell in range(worksheet.ncols):
                        # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date,
                        #             4=Boolean, 5=Error, 6=Blank
                        # Only text cells (type 1) carry sentences.
                        if worksheet.cell_type(curr_row, curr_cell) == 1:
                            cell_value = worksheet.cell_value(curr_row, curr_cell)
                            for sentence in comm.replaceToPunkts(cell_value):
                                getEntities.getEntities(url, sentence, ontologyData)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors, "read_excel.py " + url)
def readPdf(filePath, url, od):
    """Download the PDF at *url* to *filePath* and extract entities from
    the text of every page.

    Page-extraction errors are logged via comm.printException; errors in
    the download or in opening the file propagate to the caller, as before.
    """
    urldownl(url, filePath)
    # Context manager fixes the original's resource leak: the file handle
    # from open(filePath, "rb") was never closed.
    with open(filePath, "rb") as pdf_file:
        pdf = PdfFileReader(pdf_file)
        pdf.strict = True
        try:
            for page in pdf.pages:
                text = page.extractText()
                for sentence in comm.replaceToPunkts(text):
                    getEntities.getEntities(url, sentence, od)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "read_pdf.py " + url)
def readHtmlPage(htmlurl, readedPage, ontologyData, _encoding):
    """Parse the HTML document at *htmlurl*, strip non-content elements,
    and extract entities from the body text.

    NOTE(review): *readedPage* is accepted but never used — the document is
    parsed again from *htmlurl*; confirm whether parsing *readedPage* was
    intended.  All failures are logged via comm.printException.
    """
    try:
        sentences = set()
        root = parse(htmlurl).getroot()
        if root is not None:
            # Remove everything that carries no prose content; the tag
            # order matches the original sequence of drop loops.
            for tag in ("head", "script", "style", "noscript", "input", "form", "title", "img"):
                for element in root.iter(tag):
                    element.drop_tree()
            for element in root.iter("body"):
                try:
                    sentences.add(element.text_content())
                except Exception:
                    pass
            for lau in list(sentences):
                if lau != "":
                    for s6ne in comm.replaceToPunkts(lau):
                        getEntities.getEntities(htmlurl, s6ne.strip(), ontologyData)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors, "read_html.py " + _encoding + " " + htmlurl)
def readHtmlPage(htmlurl, readedPage, ontologyData, _encoding):
    """Parse the HTML document at *htmlurl*, strip non-content elements,
    and extract entities from the body text.

    NOTE(review): this is a duplicate definition of readHtmlPage (an
    identical function appears earlier in this file); being later, this
    one takes effect — confirm whether one copy should be deleted.
    NOTE(review): *readedPage* is accepted but never used — the document
    is parsed again from *htmlurl*; confirm intent.
    """
    try:
        sentences = set()
        root = parse(htmlurl).getroot()
        if root is not None:
            # Remove everything that carries no prose content; the tag
            # order matches the original sequence of drop loops.
            for tag in ("head", "script", "style", "noscript", "input", "form", "title", "img"):
                for element in root.iter(tag):
                    element.drop_tree()
            for element in root.iter("body"):
                try:
                    sentences.add(element.text_content())
                except Exception:
                    pass
            for lau in list(sentences):
                if lau != "":
                    for s6ne in comm.replaceToPunkts(lau):
                        getEntities.getEntities(htmlurl, s6ne.strip(), ontologyData)
    except Exception:
        comm.printException(comm.pathToSaveParsingErrors, "read_html.py " + _encoding + " " + htmlurl)
def startToGetEntities(jsonurl, lause, ontologyData):
    """Split *lause* into sentences and run entity extraction on each one."""
    for punkt_sentence in comm.replaceToPunkts(lause):
        getEntities.getEntities(jsonurl, punkt_sentence, ontologyData)