Code example #1
import logging
import urllib2

import lxml.etree
import lxml.html
from BeautifulSoup import UnicodeDammit

import bte  # Body Text Extraction module providing html2text()

logger = logging.getLogger(__name__)


# NOTE: the original snippet begins mid-function; the wrapper name
# fetch_page, the result dict, and the final return are reconstructed.
def fetch_page(url):
    """Download url, detect its encoding, and pull out text and metadata."""
    result = {}
    try:
        conn = urllib2.urlopen(url)
        webfile = conn.read()
    except Exception as e:
        logger.info("Cannot download URL:%s\t%s", url, e)
    else:
        if not webfile:
            return result
        # Let UnicodeDammit guess the page encoding from the raw bytes.
        converted = UnicodeDammit(webfile)  # , isHTML=True)
        if not converted.unicode:
            logger.info("UnicodeDammit failed to detect encoding, tried [%s]",
                        ', '.join(converted.triedEncodings))
            return result
        logger.debug("UnicodeDammit: originalEncoding:%s, triedEncodings:%s",
                     converted.originalEncoding,
                     ', '.join(converted.triedEncodings))
        result['raw'] = converted.unicode
        result['text'] = bte.html2text(converted.unicode)
        try:
            root = lxml.html.fromstring(webfile)
        except lxml.etree.ParserError as e:
            logger.info("Cannot parse URL:%s\t%s", url, e)
            return dict()
        # XPath expressions for the metadata fields of interest.
        find = {'description': './head/meta[@name="description"]/@content',
                'keywords': './head/meta[@name="keywords"]/@content',
                'title': './head/title/text()',
                'lang': './@lang'}  # lang attribute of the <html> root
        for key, value in find.iteritems():
            try:
                result[key] = root.xpath(value)[0]
            except IndexError:
                pass  # the page simply lacks this field
            except UnicodeDecodeError as e:
                logger.info("UnicodeDecodeError\t%s", e)
    return result
Code example #2

import chardet
import bte  # Body Text Extraction module providing html2text()

def get_bte_content(wr):
    # wr is a fetched response exposing its raw bytes as wr.content.
    # Guess the encoding; fall back to UTF-8 if chardet gives up.
    encoding = chardet.detect(wr.content)['encoding'] or 'utf-8'
    content = wr.content.decode(encoding, errors='replace')
    # Strip boilerplate with BTE and split the text into paragraphs.
    paragraphs = bte.html2text(content).split('\n')
    return paragraphs
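A hedged usage sketch for get_bte_content(): the function only needs an object carrying raw bytes in a .content attribute, which a requests.Response provides.

import requests

wr = requests.get("http://example.com/")
for para in get_bte_content(wr):
    if para.strip():  # skip the blank lines left by the split
        print(para)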
Code example #3
import io
import sys

import justext
from goose import Goose
from bte import html2text  # BTE's boilerplate-stripping html2text()

# Usage: script.py INPUT TECHNIQUE OUTPUT
# NOTE: input_file and technique were missing from the original snippet;
# reading them from argv[1] and argv[2] is a reconstruction.
input_file = io.open(sys.argv[1], "r", encoding="utf-8")
technique = sys.argv[2]  # one of 'bte', 'justext', 'goose'
output_file = io.open(sys.argv[3], "w+", encoding="utf-8")

contents = input_file.read()
# Every document in the TREC-style input begins with a <DOC> tag.
individual_docs = contents.split("<DOC>")
counter = 0
stringBuilder = []
count = 0
for doc in individual_docs[1:]:
    # Split off everything up to the description body.
    sections = doc.split("<CSDESCRIPTION>")
    header = "<DOC>" + sections[0] + "<CSDESCRIPTION>"
    print(sections[0])  # debug: show the document header
    remaining = sections[1]
    sections = remaining.split("</CSDESCRIPTION>")
    extracted = ""
    if technique == 'bte' and len(sections[0]) > 0:
        extracted = html2text(sections[0], False, False)

    elif technique == 'justext' and len(sections[0]) > 1:
        paragraphs = justext.justext(sections[0],
                                     justext.get_stoplist("English"))
        for paragraph in paragraphs:
            # Keep only paragraphs jusText classifies as real content.
            if not paragraph.is_boilerplate:
                extracted += paragraph.text

    elif technique == 'goose' and len(sections[0]) > 1:
        g = Goose({"enable_image_fetching": False})
        extracted = g.extract(raw_html=sections[0]).cleaned_text

    # Reassemble the document around the cleaned description body.
    footer = "</CSDESCRIPTION>" + sections[1]

    stringBuilder.append(header + extracted + footer)
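The snippet ends after accumulating the rebuilt documents. A plausible closing step, assuming (the original continuation is not shown, so this is an assumption) that stringBuilder is simply joined and written to output_file:

# Assumed continuation: flush the rebuilt documents and close the files.
output_file.write(u"".join(stringBuilder))
output_file.close()
input_file.close()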