def getURL(self, url, uid):
    # Fetch the page once and cache it on disk as "<uid>.html"; later calls
    # re-parse the cached copy instead of hitting the network again.
    if not os.path.isfile(str(uid) + ".html"):
        with open(str(uid) + ".html", "w") as fptr:
            try:
                response = self.opener.open(url)
            except Exception:
                return None
            data = response.read()
            # Try to gunzip first; fall back to the raw bytes if the response
            # was not gzip-compressed.
            try:
                decoded = UnicodeDammit(
                    gzip.GzipFile(fileobj=io.BytesIO(data)).read(),
                    ["windows-1252"], smart_quotes_to="html").unicode_markup
            except Exception:
                decoded = UnicodeDammit(
                    data, ["windows-1252"], smart_quotes_to="html").unicode_markup
            # Normalise a few mojibake sequences left over from bad encodings.
            decoded = decoded.replace(u"%20", u" ").replace(
                u"\u00c2", u" ").replace(u"\xe2€™", u"'").replace(
                u"\xe2€œ", u"\"").replace(u"\xe2€", u"\"").replace(u"\"“", u"-")
            # Older replacement chain kept commented out for reference:
            # .replace(u"\xe2\x80\x9c", u"\"").replace(u"\xe2\x80\x9d", u"\"")
            # .replace(u"\xc3\xb3", u"\u00f3").replace(u"\xc3\xad", u"\u00ed")
            # .replace(u"\xe2\x20\xac\x21\x22", u"'").replace(u"\xe2\x20\xac\x01\x53", u"\"")
            # .replace(u"\xe2\x20\xac", u"\"").replace(u"\xe2\x20\xac\x20\x1c", u" - ")
            # .replace(u"\xc3", u"\u00e9").replace(u"\x00\xc2", u" ")
            print >> fptr, decoded.encode('utf8')  # Python 2 print-to-file
    parser = etree.HTMLParser(target=Parser())
    with open(str(uid) + ".html", 'r') as fptr:
        data = fptr.read()
    parser.feed(data.decode('utf8'))
    return parser.close()
def clean_tabs(body):
    # Flatten an HTML-ish snippet onto a single line: normalise the encoding
    # with UnicodeDammit, then replace newlines, carriage returns and a few
    # literal tags with spaces.
    converted = UnicodeDammit(body).unicode_markup
    converted = converted.replace("\n\n", " ")
    converted = converted.replace("\\n", " ")  # literal backslash-n sequences
    converted = converted.replace("\n", " ")
    converted = converted.replace("\\r", " ")  # literal backslash-r sequences
    converted = converted.replace("\r\r", " ")
    converted = converted.replace("<br>", " ")
    converted = converted.replace("</a>", " ")
    converted = converted.replace("<a href=", " ")
    converted_cleaned = converted.replace("\r", " ")
    return converted_cleaned
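# Hypothetical usage sketch (not part of the original snippet): assumes
# clean_tabs() from above and "from bs4 import UnicodeDammit" are in scope;
# the sample string is made up purely for illustration.
if __name__ == "__main__":
    sample = "First line\r\nSecond line<br>see <a href=\"x\">here</a>"
    print(clean_tabs(sample))  # the text comes back flattened onto one line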
def getContent(soup, source=''):
    newContent = []
    # Cleaning phase
    genericCleaning(soup)
    sourceSpecificcleaning(soup, source)
    # f = open("content.html", 'w'); f.write(soup.prettify().encode('utf-8')); f.close()
    # Finding content in the tree
    bestElem = None
    bestText = ''
    for el in soup.findAll(True):
        score = 0.0
        hasTitle = False
        if el.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7'] \
                and el.parent.name == '[document]':
            score += 3
        for c in el:
            if c.name == 'br':  # business insider style
                score += 0.5
            if c.name == 'p':
                score += 1.0
            if not hasTitle and c.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7']:
                score += 1.0
                hasTitle = True
        if score >= 3.0:  # at least 3 paragraphs
            textOutput = getText(el)
            if float(len(textOutput)) / score > 20.0:  # at least 20 characters per container
                newContent.append(textOutput)
        elif score >= 1.0:
            if bestElem is None:
                bestElem = el
                bestText = getText(el, False)
            else:
                a = getText(el, False)
                if len(a) > len(bestText):
                    bestElem = el
                    bestText = a
    if len(newContent) == 0 and bestElem is not None:
        # in case nothing had a score of 3, but something had a score of 1 or more
        newContent.append(bestText)
    finalText = UnicodeDammit(u'\n'.join(newContent), smart_quotes_to='ascii').unicode_markup
    return finalText.replace('\n\n', '\n')
def remove_evernote_link(link, html):
    # Normalise both the document and the matched link text to unicode so the
    # literal replacement below compares like with like.
    html = UnicodeDammit(html, ['utf-8'], is_html=True).unicode_markup
    link_converted = UnicodeDammit(link.WholeRegexMatch, ['utf-8'], is_html=True).unicode_markup
    sep = u'<span style="color: rgb(105, 170, 53);"> | </span>'
    sep_regex = escape_regex(sep)
    no_start_tag_regex = r'[^<]*'
    regex_replace = r'<{0}[^>]*>[^<]*{1}[^<]*</{0}>'
    # html = re.sub(regex_replace.format('li', link.WholeRegexMatch), "", html)
    # Remove link (use the unicode-normalised match so it is found in the
    # normalised html)
    html = html.replace(link_converted, "")
    # Remove empty li
    html = re.sub(regex_replace.format('li', no_start_tag_regex), "", html)
    # Remove dangling separator
    regex_span = regex_replace.format('span', no_start_tag_regex) + no_start_tag_regex + sep_regex
    html = re.sub(regex_span, "", html)
    # Remove double separator
    html = re.sub(sep_regex + no_start_tag_regex + sep_regex, sep_regex, html)
    return html
# Parse the configuration file given on the command line (or default to 'config.ini')
parser = argparse.ArgumentParser(description='Create players for past '
                                             'DF Game Club sessions')
parser.add_argument('config', nargs='?', default='config.ini',
                    help='configuration file with past session data')
namespace = parser.parse_args()

# Read configuration file
conf = configparser.ConfigParser(interpolation=None)
conf.read(namespace.config)

for section in conf.sections():
    # Fetch the raw chat log from pastebin and split it into lines
    pastebin_url = PASTEBIN_RAW_PREFIX + \
        urlparse(conf[section]["pastebin_url"]).path.strip('/')
    log = UnicodeDammit(urlopen(pastebin_url).read()).unicode_markup
    log = log.replace('\r', '').split('\n')

    regex = re.compile(conf[section]['regex'])
    timestamp_format = conf[section]['timestamp_format']
    video_timestamp = conf[section]['video_timestamp']
    # Twitch archive path looks like "<channel>/v/<archive_id>"
    twitch_url = urlparse(conf[section]['twitch_url'])\
        .path.strip('/').split('/')
    channel, archive_id = twitch_url[0], twitch_url[2]
    ignore_lines = [
        int(num) for num in [
            item for item in
            conf[section]['ignore_lines'].replace(' ', '').split(',')
            if item != ''
        ]
    ]
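# Illustrative sketch (not from the original source): the loop above expects
# each config.ini section to define the keys it reads.  A section might look
# roughly like this; every value below is invented for illustration.
# Interpolation is disabled when the parser is created, so the literal "%"
# characters in timestamp_format are safe.
#
#   [session-01]
#   pastebin_url = https://pastebin.com/XXXXXXXX
#   regex = (?P<time>\d\d:\d\d) <(?P<nick>[^>]+)> (?P<msg>.*)
#   timestamp_format = %H:%M
#   video_timestamp = 00:01:23
#   twitch_url = https://www.twitch.tv/somechannel/v/12345678
#   ignore_lines = 3, 7, 12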