def spreadURLsByContentType(url, httpResponse, tyyp, od, _encoding, filePath=None):
    """Dispatch a fetched document to the parser matching its content type.

    Args:
        url: source URL of the document, forwarded to every parser.
        httpResponse: raw response body.
        tyyp: content-type string; matched by case-insensitive substring.
        od: data sink object passed through to every parser
            (presumably an initRdf.OntologyData — confirm with caller).
        _encoding: declared encoding, forwarded to detectEncoding() for the
            text-based types (xml/html/json/plain).
        filePath: local path of a downloaded file; only used for Excel here.

    Unrecognized types are appended to the log at comm.pathToSaveParsingErrors.
    """
    doctext = httpResponse
    kind = tyyp.lower()  # hoisted: every branch matches case-insensitively

    if "excel" in kind:
        # Excel is parsed from the downloaded file, not the response body.
        read_eksel.readExcel(filePath, url, od)
    elif "xml" in kind:
        doctext = detectEncoding(_encoding, httpResponse)
        read_xml.readXml(url, doctext, od)
    elif "html" in kind:
        doctext = detectEncoding(_encoding, httpResponse)
        read_html.readHtmlPage(url, doctext, od)
    elif "json" in kind:
        doctext = detectEncoding(_encoding, httpResponse)
        read_json.readJson(url, doctext, od)
    elif "pdf" in kind:
        # NOTE(review): this passes the raw response body, while a sibling
        # variant passes filePath — confirm which signature readPdf expects.
        read_pdf.readPdf(url, doctext, od)
    elif "plain" in kind or "text" in kind:
        # Assumes the payload is plain text; parse it line by line.
        doctext = detectEncoding(_encoding, httpResponse)
        read_plaintext.readPlainText(url, doctext, od)
    else:
        # 'with' guarantees the log handle is closed even if write() raises;
        # the original open()/close() pair could leak the handle.
        with open(comm.pathToSaveParsingErrors, 'a', encoding='utf-8') as jf:
            jf.write(time.strftime("%d/%m/%Y_%H:%M:%S") + " " + url + " "
                     + "The_parser_for_the_type_" + tyyp + "_is_not_implemented\n")
def spreadURLsByContentType(url, httpResponse, tyyp, od, _encoding, filePath=None):
    """Dispatch a fetched document to the parser matching its content type.

    Each parser call is wrapped so a failure in one document is logged via
    comm.printException instead of aborting the whole run.

    Args:
        url: source URL of the document, forwarded to every parser.
        httpResponse: raw response body.
        tyyp: content-type string; matched by case-insensitive substring.
        od: data sink object passed through to every parser.
        _encoding: declared encoding; forwarded to detectEncoding() and to the
            text-based parsers (html/json/plain).
        filePath: local path of a downloaded file; used for Excel and PDF.
    """
    doctext = httpResponse
    # Case-insensitive matching, consistent with the sibling variants of this
    # function (the original compared against the raw header casing).
    kind = tyyp.lower()

    if "excel" in kind:
        try:
            # Excel is parsed from the downloaded file, not the response body.
            read_eksel.readExcel(filePath, url, od)
        except Exception:
            # 'except Exception' (not bare 'except:') so SystemExit and
            # KeyboardInterrupt still propagate.
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_excel")
    elif "pdf" in kind:
        try:
            read_pdf.readPdf(filePath, url, od)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_pdf")
    elif "xml" in kind:
        try:
            doctext = detectEncoding(_encoding, httpResponse)
            read_xml.readXml(url, doctext, od)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_xml")
    elif "html" in kind:
        try:
            doctext = detectEncoding(_encoding, httpResponse)
            read_html.readHtmlPage(url, doctext, od, _encoding)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_html")
    elif "json" in kind:
        try:
            doctext = detectEncoding(_encoding, httpResponse)
            read_json.readJson(url, doctext, od, _encoding)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_json")
    elif "plain" in kind or "text" in kind:
        try:
            # Assumes the payload is plain text; parse it line by line.
            doctext = detectEncoding(_encoding, httpResponse)
            read_plaintext.readPlainText(url, doctext, od, _encoding)
        except Exception:
            comm.printException(comm.pathToSaveParsingErrors, "fileparser_plainText")
    else:
        comm.printException(comm.pathToSaveParsingErrors,
                            "The_parser_for_the_type_" + tyyp + "_is_not_implemented\n")
def spreadURLsByContentType(url, httpResponse, tyyp, od, _encoding, filePath=None):
    """Route a fetched document to the content-type-specific parser.

    Args:
        url: source URL of the document, forwarded to every parser.
        httpResponse: raw response body.
        tyyp: content-type string; matched by case-insensitive substring.
        od: data sink object passed through to every parser
            (presumably an initRdf.OntologyData — confirm with caller).
        _encoding: declared encoding, forwarded to detectEncoding() for the
            text-based types (xml/html/json/plain).
        filePath: local path of a downloaded file; only used for Excel here.

    Unrecognized types are appended to the log at comm.pathToSaveParsingErrors.
    """
    doctext = httpResponse
    kind = tyyp.lower()  # computed once; every branch matches case-insensitively

    if "excel" in kind:
        # Excel is parsed from the downloaded file, not the response body.
        read_eksel.readExcel(filePath, url, od)
    elif "xml" in kind:
        doctext = detectEncoding(_encoding, httpResponse)
        read_xml.readXml(url, doctext, od)
    elif "html" in kind:
        doctext = detectEncoding(_encoding, httpResponse)
        read_html.readHtmlPage(url, doctext, od)
    elif "json" in kind:
        doctext = detectEncoding(_encoding, httpResponse)
        read_json.readJson(url, doctext, od)
    elif "pdf" in kind:
        # NOTE(review): this passes the raw response body, while a sibling
        # variant passes filePath — confirm which signature readPdf expects.
        read_pdf.readPdf(url, doctext, od)
    elif "plain" in kind or "text" in kind:
        # Assumes the payload is plain text; parse it line by line.
        doctext = detectEncoding(_encoding, httpResponse)
        read_plaintext.readPlainText(url, doctext, od)
    else:
        # Context manager closes the log handle even if write() raises;
        # the original open()/close() pair could leak the handle.
        with open(comm.pathToSaveParsingErrors, 'a', encoding='utf-8') as jf:
            jf.write(time.strftime("%d/%m/%Y_%H:%M:%S") + " " + url + " "
                     + "The_parser_for_the_type_" + tyyp + "_is_not_implemented\n")