conn = urllib2.urlopen(url) webfile = conn.read() except Exception, e: logger.info("Cannot download URL:%s\t%s", url, e) else: if not webfile: return result converted = UnicodeDammit(webfile) #, isHTML=True) if not converted.unicode: logger.info("UnicodeDammit failed to detect encoding, tried [%s]", \ ', '.join(converted.triedEncodings)) return result logger.debug("UnicodeDammit: originalEncoding:%s, triedEncodings:%s", converted.originalEncoding, ', '.join(converted.triedEncodings)) result['raw'] = converted.unicode result['text'] = bte.html2text(converted.unicode) root = None try: root = lxml.html.fromstring(webfile) except lxml.etree.ParserError, e: logger.info("Can not parse URL:%s\t%s", url, e) return dict() find = {'description' : "./head/meta[@name=\"description\"]/@content", 'keywords' : "./head/meta[@name=\"keywords\"]/@content", 'title' : "./head/title/text()", 'lang' : "./html[@name=\"lang\"]"} for key, value in find.iteritems(): try : result[key] = root.xpath(value)[0] except UnicodeDecodeError, e: logger.info("UnicodeDecodeError\t%s", e)
def get_bte_content(wr):
    """Decode a web response body and return its boilerplate-stripped paragraphs.

    Args:
        wr: a response-like object exposing a ``content`` attribute holding
            the raw (bytes) payload — presumably a requests.Response; TODO confirm.

    Returns:
        List of paragraph strings: ``bte.html2text`` output split on newlines.
    """
    detected = chardet.detect(wr.content)['encoding']
    # chardet returns {'encoding': None, ...} when it cannot make a guess
    # (e.g. empty or pathological content); decoding with None would raise
    # TypeError, so fall back to UTF-8 — errors='replace' keeps us safe
    # from any resulting mojibake either way.
    encoding = detected or 'utf-8'
    content = wr.content.decode(encoding, errors='replace')
    paragraphs = bte.html2text(content).split('\n')
    return paragraphs
# Top-level script pass: read a TREC-style collection where each document is
# introduced by <DOC> and its HTML payload is wrapped in
# <CSDESCRIPTION> ... </CSDESCRIPTION>; strip boilerplate from the payload
# with the technique selected elsewhere ('bte' | 'justext' | 'goose') and
# collect the rebuilt documents.  NOTE(review): input_file, technique, and
# the eventual write of stringBuilder are defined outside this view.
output_file = io.open(sys.argv[3], "w+", encoding="utf-8")
contents = input_file.read()
individual_docs = contents.split("<DOC>")
counter = 0
stringBuilder = []  # rebuilt documents, presumably joined and written later — TODO confirm
count = 0
# individual_docs[0] is whatever precedes the first <DOC> marker, so skip it.
for doc in individual_docs[1:]:
    sections = doc.split("<CSDESCRIPTION>")
    # Everything before the opening tag is document metadata; keep it verbatim.
    header = "<DOC>" + sections[0] + "<CSDESCRIPTION>"
    print(sections[0])
    # Re-split the remainder to separate the HTML payload from the trailing
    # part of the document.  NOTE(review): assumes every doc contains a
    # <CSDESCRIPTION> pair — a malformed doc would raise IndexError here.
    remaining = sections[1]
    sections = remaining.split("</CSDESCRIPTION>")
    extracted = ""
    if technique == 'bte' and len(sections[0]) > 0:
        # Body Text Extraction on the raw HTML between the tags.
        extracted = html2text(sections[0], False, False)
    elif technique == 'justext' and len(sections[0]) > 1:
        # jusText classifies paragraphs; keep only the non-boilerplate ones.
        paragraphs = justext.justext(sections[0], justext.get_stoplist("English"))
        for paragraph in paragraphs:
            if not paragraph.is_boilerplate:
                extracted += paragraph.text
    elif technique == 'goose' and len(sections[0]) > 1:
        # Goose article extraction; image fetching disabled for speed.
        g = Goose({"enable_image_fetching": False})
        extracted = g.extract(raw_html=sections[0]).cleaned_text
    # Re-attach the closing tag plus whatever followed it in the source doc.
    footer = "</CSDESCRIPTION>" + sections[1]
    stringBuilder.append((header + extracted + footer))