def most_repeated(data):
    michael_distinct_count = utils.word_count(' '.join(data["michael"]))
    dwight_distinct_count = utils.word_count(' '.join(data["dwight"]))
    michael_distinct_count = dict(
        sorted(michael_distinct_count.items(), key=lambda item: item[1], reverse=True))
    dwight_distinct_count = dict(
        sorted(dwight_distinct_count.items(), key=lambda item: item[1], reverse=True))
    dwight_most_repeated = []
    michael_most_repeated = []
    # Collect the top words each speaker uses that the other never does.
    for word, count in michael_distinct_count.items():
        if word not in dwight_distinct_count:
            michael_most_repeated.append((word, count))
        if len(michael_most_repeated) >= 10:
            break
    for word, count in dwight_distinct_count.items():
        if word not in michael_distinct_count:
            dwight_most_repeated.append((word, count))
        if len(dwight_most_repeated) >= 10:
            break
    print(dwight_most_repeated)
    print(michael_most_repeated)
    utils.plot([x[0] for x in dwight_most_repeated],
               [x[1] for x in dwight_most_repeated], "Dwight", True)
    utils.plot([x[0] for x in michael_most_repeated],
               [x[1] for x in michael_most_repeated], "Michael", True)
    return dwight_most_repeated, michael_most_repeated
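# A minimal usage sketch for most_repeated, assuming utils.word_count returns
# a {word: count} dict and utils.plot draws a bar chart. The data below is
# invented: each speaker maps to a list of line strings.
sample_data = {
    "michael": ["that's what she said", "I declare bankruptcy"],
    "dwight": ["bears eat beets", "fact: bears are the best"],
}
dwight_top, michael_top = most_repeated(sample_data)
# Each result is up to 10 (word, count) pairs, most frequent first, for words
# the other speaker never uses.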
def compute_RNF(docA, docB):
    # Relative normalized frequency: the per-word rate in docA divided by the
    # per-word rate in docB, for words that occur in both documents.
    wc_A = utils.word_count(docA)
    wc_B = utils.word_count(docB)
    total_A = sum(wc_A.values())
    total_B = sum(wc_B.values())
    RNF = {}
    for word in wc_A:
        if word not in wc_B:
            continue
        RNF[word] = (wc_A[word] / total_A) / (wc_B[word] / total_B)
    return dict(sorted(RNF.items(), key=lambda item: item[1], reverse=True))
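# A worked example of compute_RNF, assuming utils.word_count splits on
# whitespace and returns a {word: count} dict. The two documents are invented.
doc_a = "coffee coffee tea"
doc_b = "coffee tea tea tea"
ranked = compute_RNF(doc_a, doc_b)
# 'coffee': (2/3) / (1/4) ≈ 2.67 (favored by docA);
# 'tea':    (1/3) / (3/4) ≈ 0.44 (favored by docB)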
def is_bad_node(self, node):
    text = node.text_content().strip()
    if node.tag.lower() in ('img', 'br'):
        return False
    if not text and not node.getchildren():
        return True
    # Keep nodes whose images reference the page title.
    for img in node.xpath('.//img'):
        if self.title in img.get('alt', '') \
                or self.title in img.get('title', ''):
            return False
    text_len = word_count(text)
    link_len, link_cnt = 0, 0
    for link in node.findall('.//a'):
        link_cnt += 1
        if not link.text_content():
            continue
        link_len += word_count(link.text_content())
    # Link-density heuristics: drop nodes that are mostly anchor text.
    if link_cnt > 1 and text_len > 1 and link_len / float(text_len) > 0.4:
        return True
    if link_cnt > 1 and text_len / float(link_cnt) < 10:
        return True
    if link_cnt > 1 and node.cssselect('li a'):
        return True
    block_cnt = len(node.xpath(BAD_XPATH))
    if link_cnt > 0 and block_cnt > 1 and len(node.cssselect('pre')) == 0:
        return True
    # Short nodes containing long digit runs (e.g. phone numbers) are bad.
    if text_len / float(self.len + 1) < 0.15 or text_len < 100:
        if re.search(r'\d{3,}-\d+-\d+', text):
            return True
    # filterRe = re.compile(u'点击(.*)(进入|观看)|^事实\+$')
    # (matches promo lines like "click ... to enter/watch")
    # if filterRe.match(text):
    #     return True
    return False
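# A small demonstration of the 0.4 link-density threshold above, using a
# whitespace split as a stand-in for word_count. The HTML fragment is invented.
from lxml import html as lxml_html

frag = lxml_html.fromstring(
    '<div>read more <a href="#">here</a> and <a href="#">also over here</a></div>')
total_words = len(frag.text_content().split())  # 7 words in total
link_words = sum(len(a.text_content().split()) for a in frag.findall('.//a'))  # 4 inside links
# link density 4/7 ≈ 0.57 > 0.4, so a node like this counts as a bad (link-heavy) node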
def post_html(contents, title, permalink, taglist, stream_only, metadata,
              scrutinize=True, allow_comments=True, Patreon_type="blog"):
    head = []
    post_content = blog_server_shared.postprocess_post_string(
        contents, metadata["id"], title, False, scrutinize)[0]
    head.append("<script>window.elidupree.handle_content_warnings('"
                + metadata["id"] + "', false)</script>")
    # Replace each <transcript …> tag with a show/hide transcript block.
    next_transcript_number = 1
    while True:
        transcript_generator = re.search(
            r"<transcript" + blog_server_shared.grouped_string_regex("transcript_text") + ">",
            post_content, re.DOTALL)
        if transcript_generator is None:
            break
        transcript_identifier_string = str(next_transcript_number) + '_' + metadata["id"]
        post_content = (
            post_content[0:transcript_generator.start(0)]
            + '<div id="transcript_' + transcript_identifier_string
            + '" class="transcript_block"><div class="transcript_header">Transcript: '
            + '<a id="show_transcript_button_' + transcript_identifier_string
            + '" href="javascript:;">(show)</a>'
            + '<a id="hide_transcript_button_' + transcript_identifier_string
            + '" href="javascript:;">(hide)</a></div>'
            + '<div class="transcript_content id' + transcript_identifier_string + '">'
            + transcript_generator.group("transcript_text") + '</div></div>'
            + post_content[transcript_generator.end(0):])
        head.append('''<style>
html.transcript_hidden_''' + transcript_identifier_string + ''' div.transcript_content.id''' + transcript_identifier_string + ''' {display: none;}
#show_transcript_button_''' + transcript_identifier_string + ''' {display: none;}
html.transcript_hidden_''' + transcript_identifier_string + ''' #show_transcript_button_''' + transcript_identifier_string + ''' {display: inline;}
html.transcript_hidden_''' + transcript_identifier_string + ''' #hide_transcript_button_''' + transcript_identifier_string + ''' {display: none;}
</style>
<script>
window.elidupree.handle_transcript("''' + transcript_identifier_string + '''");
</script>''')
        next_transcript_number = next_transcript_number + 1
    if stream_only == True:
        cutter = re.compile(r"<cut>.*?</p>.*$", re.DOTALL)
        post_content = cutter.sub('''[...]</p>
<a class="continue_reading" href="''' + permalink + '''">Continue reading<span class="invisible"> ''' + title + '''</span>...</a>''', post_content)
        # This sometimes cuts off anchors, so make sure fragments point at the canonical URL.
        post_content = re.sub('href="#', 'href="' + permalink + '#', post_content)
    else:
        post_content = re.sub("<cut>", "", post_content)
    calculate_readability = (stream_only != True)
    if calculate_readability:
        # Using the Automated Readability Index.
        reference = re.sub(r"\s+", " ", html.unescape(utils.strip_tags(post_content)))
        sentences = len(re.findall(r"\w\w\w.*?[.?!]", reference))
        words = utils.word_count(reference)
        characters = len(re.findall(r"\w", reference))
        if words > 0 and sentences > 0:
            readability = 4.71 * characters / words + 0.5 * words / sentences - 21.43
            post_content = (post_content + '<em class="debug"> Approximate readability: '
                + "{:.2f}".format(readability) + " (" + str(characters) + " characters, "
                + str(words) + " words, " + str(sentences) + " sentences, "
                + "{:.2f}".format(characters / words) + " characters per word, "
                + "{:.2f}".format(words / sentences) + " words per sentence)</em>")
    post_content_sections = post_content.split("<bigbreak>")
    id_str = ''
    if title:
        id_str = 'id="' + utils.format_for_url(title) + '"'
    post_content_sections[0] = ('<h1><a class="post_title_link" href="' + permalink + '">'
                                + title + '</a></h1>' + post_content_sections[0])
    for i in range(0, len(post_content_sections)):
        post_content_sections[i] = ('<div class="post_content_section">'
                                    + post_content_sections[i] + '</div>')
    return ('''
<div ''' + id_str + ''' class="blog_post">
''' + ''.join(post_content_sections) + '''
</div>''' + metadata_and_comments_section_html(
        title, permalink, taglist, stream_only, metadata,
        allow_comments=allow_comments, Patreon_type=Patreon_type),
        "".join(head))
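# The score computed above is the Automated Readability Index (ARI):
#   ARI = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43
# A minimal standalone sketch of the same formula; the whitespace word split
# and regex sentence split are simplifications of the utils helpers used above.
import re

def approximate_ari(text):
    words = len(text.split())
    sentences = len(re.findall(r"\w\w\w.*?[.?!]", text)) or 1
    characters = len(re.findall(r"\w", text))
    return 4.71 * characters / words + 0.5 * words / sentences - 21.43

# approximate_ari("The quick brown fox jumps over the lazy dog.")
# -> 4.71 * 35/9 + 0.5 * 9/1 - 21.43 ≈ 1.39 (an early grade level)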
def __init__(self, input, **options):
    self.input = input
    self.url = options.get('url', '')
    self.debug = options.get('debug', False)
    self.title = options.get('title', '^^')
    self.pages = options.get('pages', None)
    self.texts = options.get('texts', None)
    self.domain = get_domain(self.url)
    self.options = options
    self.doc = clean_html(input, return_doc=True)
    self.text = self.doc.text_content()
    self.len = word_count(self.text) if self.text else 0
def get_common_words(data_description_text_list):
    """
    Return all words from the text descriptions in the input list,
    ordered from most to least frequent.
    """
    processed_description_text_list = sanitize_sample_descriptions(
        data_description_text_list)
    all_text = ' '.join(processed_description_text_list)
    counts = word_count(all_text)
    return [
        text for text, count in
        sorted(counts.items(), key=operator.itemgetter(1))[::-1]
    ]
def word_frequency(data):
    # Join both speakers' lines with a separating space so the boundary
    # words are not fused together.
    word_frequency = utils.word_count(
        ' '.join(data["michael"]) + ' ' + ' '.join(data["dwight"]))
    word_frequency = dict(
        sorted(word_frequency.items(), key=lambda item: item[1], reverse=True))
    for item in list(word_frequency.items())[:10]:
        print(item)
    utils.plot(
        list(word_frequency.keys())[:180],
        list(word_frequency.values())[:180],
        "Histogram of Word Frequencies", True, True)
def sphinx(r, audio):
    payload = {'count': 'invalid'}
    meta = {}
    try:
        phrase = r.recognize_sphinx(audio)
        payload['count'] = utils.word_count(phrase)
        meta['text'] = phrase.split()
    except sr.UnknownValueError:
        payload['error'] = "Sphinx could not understand audio"
        logger.error("Sphinx couldn't understand audio")
    except sr.RequestError as e:
        payload['error'] = "Sphinx error; {0}".format(e)
        logger.error("Sphinx error; {0}".format(e))
    payload.update({'meta': meta})
    return payload
def add_first_layer_features(self):
    '''
    Add first layer of features using the udf functions from util.py.

    Input:
    -------
    None

    Output:
    -------
    None
    '''
    self.df = self.df.withColumn('sentence_cnt', utils.sentence_count(self.df.reviewText)) \
        .withColumn('word_cnt', utils.word_count(self.df.reviewText)) \
        .withColumn('capital_cnt', utils.count_capital(self.df.reviewText)) \
        .withColumn('upper_word_cnt', utils.all_caps(self.df.reviewText)) \
        .withColumn('punctuation_cnt', utils.count_punctuation(self.df.reviewText)) \
        .withColumn('overall_transform', utils.overall_transform(self.df.overall))
def vectorize(essay, setnum):
    topics = [[setnum, essay]]
    x_sennum = utils.sentence_num(topics)
    x_senlen = utils.sentence_len(topics)
    x_wlen = utils.word_count(topics)
    x_lwlen = utils.long_word_count(topics)
    x_pclen = utils.punctuation_count(topics)
    x_uclen = utils.unique_valid_word_count(topics)
    x_awlen = utils.average_word_length(topics)
    x_pslen = utils.part_of_speech_count(topics)
    x_uplen = utils.unique_valid_word_prop(topics)
    x_one = get_ngram(setnum, 1, essay)
    x_two = get_ngram(setnum, 2, essay)
    x_three = get_ngram(setnum, 3, essay)
    x_four = get_ngram(setnum, 4, essay)
    x_five = get_ngram(setnum, 5, essay)
    vector_dict = {'sentence_num': x_sennum,
                   'sentence_len': x_senlen,
                   'word_count': x_wlen,
                   'long_word_count': x_lwlen,
                   'punctuation_count': x_pclen,
                   'unique_valid_word_count': x_uclen,
                   'average_word_length': x_awlen,
                   'noun_adj_adv_count': x_pslen,
                   'unique_valid_word_prop': x_uplen,
                   '1gram_frequency': x_one,
                   '2gram_frequency': x_two,
                   '3gram_frequency': x_three,
                   '4gram_frequency': x_four,
                   '5gram_frequency': x_five}
    return vector_dict
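# A usage sketch for vectorize; the essay text and set number below are
# invented, and the utils/get_ngram helpers are assumed to accept the
# [[setnum, essay]] topics format used above.
features = vectorize("Computers let students explore ideas at their own pace.", 1)
# features maps 14 feature names (surface statistics plus 1- to 5-gram
# frequencies) to values, ready to assemble into one model input row.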
def wit(r, audio):
    payload = {'count': 'invalid'}
    meta = {}
    try:
        phrase = r.recognize_wit(audio, key=creds['WIT_AI_KEY'])
        payload['count'] = utils.word_count(phrase)
        meta['text'] = phrase.split()
    except sr.UnknownValueError:
        payload['error'] = "Wit.ai could not understand audio"
        logger.error("Wit.ai couldn't understand audio")
    except sr.RequestError as e:
        payload['error'] = "Could not request results from Wit.ai service; {0}".format(e)
        logger.error("Could not request results from Wit.ai service; {0}".format(e))
    payload.update({'meta': meta})  # attach meta, matching the other recognizers
    return payload
def google_sound_cloud(r, audio):
    payload = {'count': 'invalid'}
    meta = {}
    try:
        phrase = r.recognize_google_cloud(
            audio, credentials_json=creds['GOOGLE_CLOUD_SPEECH'])
        payload['count'] = utils.word_count(phrase)
        meta['text'] = phrase.split()
    except sr.UnknownValueError:
        payload['error'] = "Google Cloud Speech could not understand audio"
        logger.error("Google Cloud Speech couldn't understand audio")
    except sr.RequestError as e:
        payload['error'] = "Could not request results from Google Cloud Speech service; {0}".format(e)
        logger.error("Could not request results from Google Cloud Speech service; {0}".format(e))
    payload.update({'meta': meta})
    return payload
def bing(r, audio):
    payload = {'count': 'invalid'}
    meta = {}
    try:
        phrase = r.recognize_bing(audio, key=creds['BING_KEY'])
        payload['count'] = utils.word_count(phrase)
        meta['text'] = phrase.split()
    except sr.UnknownValueError:
        payload['error'] = "Microsoft Bing Voice Recognition could not understand audio"
        logger.error("Microsoft Bing Voice Recognition couldn't understand audio")
    except sr.RequestError as e:
        payload['error'] = "Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e)
        logger.error("Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e))
    payload.update({'meta': meta})  # attach meta, matching the other recognizers
    return payload
def google(r, audio):
    payload = {'count': 'invalid'}
    meta = {}
    try:
        phrase = r.recognize_google(audio)
        payload['count'] = utils.word_count(phrase)
        meta['text'] = phrase.split()
    except sr.UnknownValueError:
        payload['error'] = "Google Speech Recognizer could not understand audio"
        logger.error("Google Speech Recognizer couldn't understand audio")
    except sr.RequestError as e:
        payload['error'] = "Could not request results from Google Speech Recognition service; {0}".format(e)
        logger.error("Could not request results from Google Speech Recognition service; {0}".format(e))
    payload.update({'meta': meta})
    return payload
def houndify(r, audio):
    payload = {'count': 'invalid'}
    meta = {}
    try:
        phrase = r.recognize_houndify(audio,
                                      client_id=creds['HOUNDIFY_CLIENT_ID'],
                                      client_key=creds['HOUNDIFY_CLIENT_KEY'])
        payload['count'] = utils.word_count(phrase)
        meta['text'] = phrase.split()
    except sr.UnknownValueError:
        payload['error'] = "Houndify could not understand audio"
        logger.error("Houndify couldn't understand audio")
    except sr.RequestError as e:
        payload['error'] = "Could not request results from Houndify service; {0}".format(e)
        logger.error("Could not request results from Houndify service; {0}".format(e))
    payload.update({'meta': meta})
    return payload
def ibm(r, audio):
    payload = {'count': 'invalid'}
    meta = {}
    try:
        phrase = r.recognize_ibm(audio,
                                 username=creds['IBM_USERNAME'],
                                 password=creds['IBM_PASSWORD'])
        payload['count'] = utils.word_count(phrase)
        meta['text'] = phrase.split()
    except sr.UnknownValueError:
        payload['error'] = "IBM Speech to Text could not understand audio"
        logger.error("IBM Speech to Text couldn't understand audio")
    except sr.RequestError as e:
        payload['error'] = "Could not request results from IBM Speech to Text service; {0}".format(e)
        logger.error("Could not request results from IBM Speech to Text service; {0}".format(e))
    payload.update({'meta': meta})
    return payload
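# All of the recognizer wrappers above share one contract: take a Recognizer
# and an AudioData object, return a payload with 'count' (word count or
# 'invalid'), 'meta', and optionally 'error'. A minimal sketch of driving
# several of them over one recording; 'example.wav' is a made-up file name.
import speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile('example.wav') as source:
    audio = r.record(source)
for name, fn in [('sphinx', sphinx), ('google', google), ('wit', wit)]:
    payload = fn(r, audio)
    print(name, payload['count'], payload.get('error', ''))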
def img2center(doc, title):
    for node in list(doc.iter()):
        parent = node.getparent()
        previous = node.getprevious()
        next = node.getnext()
        # Strip all attributes except src/href; normalize centering styles.
        for key, value in node.attrib.items():
            if key not in ['src', 'href']:
                node.attrib.pop(key)
            if key in ['style'] and 'center' in value:
                node.set('style', 'text-align:center')
        if node.tag == 'a':
            node.set('target', '_blank')
        elif str(node.tag).lower() in ('h1', 'h2'):
            node.tag = 'h3'
        elif node.tag == 'img' and parent is not None:
            replace_node('<div class="k-img" style="text-align:center;">%s</div>', node)
            if previous is None and parent.text and parent.text.strip() \
                    or previous is not None \
                    and (previous.tail or str(previous.tag).lower() not in ('p', 'div')):
                node.addprevious(fromstring('<br>'))
            if node.tail and node.tail.strip():
                node.addnext(fromstring('<br>'))
            elif next is not None and str(next.tag).lower() not in ('p', 'div'):
                next.addprevious(fromstring('<br>'))
            # Center a short caption-like sibling: short text that does not end
            # like a sentence (unless it mentions 图/摄, i.e. image/photo
            # credits) and is not a numbered list item.
            if next is not None and next.text and next.text.strip():
                text = next.text.strip()
                if word_count(text) < 40 \
                        and (not re.match(u'.*[:.?!:。?!…]$', text)
                             or re.search(u'制图|资料图|图片|图|摄', text)) \
                        and not re.search(u'(\d+|[一二三四五六七八九十]+)[、.]', text):
                    replace_node('<div style="text-align:center;">%s</div>', next)
                continue
            if previous is None and not parent.text:
                pprevious = parent.getprevious()
                if pprevious is not None \
                        and not pprevious.xpath(BLOCK_XPATH):
                    text = pprevious.text_content().strip()
                    if word_count(text) < 40 \
                            and (not re.match(u'.*[:.?!:。?!…]$', text)
                                 or re.search(u'制图|资料图|图片|图|摄', text)) \
                            and not re.search(u'(\d+|[一二三四五六七八九十]+)[、.]', text):
                        pprevious.set('style', 'text-align:center')
                    continue
            if not node.tail and node.getnext() is None:
                pnext = parent.getnext()
                if pnext is not None \
                        and not pnext.xpath(BLOCK_XPATH):
                    text = pnext.text_content().strip()
                    if word_count(text) < 40 \
                            and (not re.match(u'.*[:.?!:。?!…]$', text)
                                 or re.search(u'制图|资料图|图片|图|摄', text)) \
                            and not re.search(u'(\d+|[一二三四五六七八九十]+)[、.]', text):
                        pnext.set('style', 'text-align:center')
                    continue
    # Strip leading line numbers from plain <pre> blocks.
    for node in doc.iter('pre'):
        if not node.getchildren():
            node.text = re.sub('(^|\r|\n) *\d+', '', (node.text or ''))
    # Give every image the page title as alt/title text.
    for node in doc.iter('img'):
        node.set('alt', title)
        node.set('title', title)
story["pages"] = [post for post in story["pages"] if "don't deploy" not in post] for post_dict in story["pages"]: index = index + 1 #post_dict["path_prefix"] = story["url"]+"/" post_dict["long_story_name"] = name post_dict["long_story_index"] = index if "listed" in story: post_dict["listed"] = True posts ["stories"].append(post_dict) for cat,post_list in posts.items(): for post_dict in post_list: if "long_story_name" not in post_dict: post_dict["path_prefix"] = "/" if cat=="" else "/"+cat+"/" post_dict["category"] = cat post_dict["word_count"] = utils.word_count (html.unescape (utils.strip_tags (post_dict ["contents"]))) if "auto_paragraphs" in post_dict: post_dict ["contents"] = utils.auto_paragraphs (post_dict ["contents"]) if cat == 'blog': post_dict['contents'] += signature for name,story in long_stories.items(): story["word_count"] = 0 for post_dict in story["pages"]: story["word_count"] = story["word_count"] + post_dict["word_count"] css.insert (''' a.small_story {display: block; padding: 0.8em 0; color: black; text-decoration: none;} a.small_story h2 {font-weight: bold; color: black;} a.small_story .blurb {font-size:71%;}
def get_words(self):
    words = word_count(self.train_data)
    # Keep the common characters, plus a space as padding.
    return words[:len(words)] + (' ', )
def wc(self):
    return word_count(self.text)
def post_html(contents, title, permalink, taglist, stream_only, metadata,
              scrutinize=True, allow_comments=True, Patreon_type="blog"):
    head = []
    post_content = blog_server_shared.postprocess_post_string(
        contents, metadata["id"], title, False, scrutinize)[0]
    before_content_warnings = post_content
    # Expand <content_warning_header …> tags into a collapsible warning box.
    content_warning_header_regex = re.compile(
        r"<content_warning_header"
        + blog_server_shared.grouped_string_regex("content_warning_header_contents") + ">",
        re.DOTALL)
    post_content = content_warning_header_regex.sub(lambda match: ('''
<div class="story_content_warning_header">
<p>This story contains:</p>
''' + hidden_cw_box('''
<ul>
''' + match.group("content_warning_header_contents") + '''
</ul>
<p>Notices will also appear in-context in the story, just before the material appears.</p>
<p>If you see other material that should be marked (such as common triggers or phobias), ''' + exmxaxixl.a('e-mail me') + '''. I am serious about web accessibility, and I will respond to your concerns as soon as I can manage.</p>
''') + '''
</div>'''), post_content)
    content_warning_p_regex = re.compile(
        r"<content_warning_p"
        + blog_server_shared.grouped_string_regex("content_warning_p_contents") + ">",
        re.DOTALL)
    post_content = content_warning_p_regex.sub(
        lambda match: secondary_hidden_cw_box(
            'This section depicts ' + match.group("content_warning_p_contents") + '.'),
        post_content)
    if post_content != before_content_warnings:
        head.append("<script>window.elidupree.handle_content_warnings('"
                    + metadata["id"] + "', false)</script>")
    # Replace each <transcript …> tag with a show/hide transcript block.
    next_transcript_number = 1
    while True:
        transcript_generator = re.search(
            r"<transcript" + blog_server_shared.grouped_string_regex("transcript_text") + ">",
            post_content, re.DOTALL)
        if transcript_generator is None:
            break
        transcript_identifier_string = str(next_transcript_number) + '_' + metadata["id"]
        post_content = (
            post_content[0:transcript_generator.start(0)]
            + '<div id="transcript_' + transcript_identifier_string
            + '" class="transcript_block"><div class="transcript_header">Transcript: '
            + '<a id="show_transcript_button_' + transcript_identifier_string
            + '" href="javascript:;">(show)</a>'
            + '<a id="hide_transcript_button_' + transcript_identifier_string
            + '" href="javascript:;">(hide)</a></div>'
            + '<div class="transcript_content id' + transcript_identifier_string + '">'
            + transcript_generator.group("transcript_text") + '</div></div>'
            + post_content[transcript_generator.end(0):])
        head.append('''<style>
html.transcript_hidden_''' + transcript_identifier_string + ''' div.transcript_content.id''' + transcript_identifier_string + ''' {display: none;}
#show_transcript_button_''' + transcript_identifier_string + ''' {display: none;}
html.transcript_hidden_''' + transcript_identifier_string + ''' #show_transcript_button_''' + transcript_identifier_string + ''' {display: inline;}
html.transcript_hidden_''' + transcript_identifier_string + ''' #hide_transcript_button_''' + transcript_identifier_string + ''' {display: none;}
</style>
<script>
window.elidupree.handle_transcript("''' + transcript_identifier_string + '''");
</script>''')
        next_transcript_number = next_transcript_number + 1
    if stream_only == True:
        cutter = re.compile(r"<cut>.*?</p>.*$", re.DOTALL)
        post_content = cutter.sub('''[...]</p>
<a class="continue_reading" href="''' + permalink + '''">Continue reading<span class="invisible"> ''' + title + '''</span>...</a>''', post_content)
        # This sometimes cuts off anchors, so make sure fragments point at the canonical URL.
        post_content = re.sub('href="#', 'href="' + permalink + '#', post_content)
    else:
        post_content = re.sub("<cut>", "", post_content)
    calculate_readability = (stream_only != True)
    if calculate_readability:
        # Using the Automated Readability Index.
        reference = re.sub(r"\s+", " ", html.unescape(utils.strip_tags(post_content)))
        sentences = len(re.findall(r"\w\w\w.*?[.?!]", reference))
        words = utils.word_count(reference)
        characters = len(re.findall(r"\w", reference))
        if words > 0 and sentences > 0:
            readability = 4.71 * characters / words + 0.5 * words / sentences - 21.43
            post_content = ('<em class="debug"> Approximate readability: '
                + "{:.2f}".format(readability) + " (" + str(characters) + " characters, "
                + str(words) + " words, " + str(sentences) + " sentences, "
                + "{:.2f}".format(characters / words) + " characters per word, "
                + "{:.2f}".format(words / sentences) + " words per sentence)</em>"
                + post_content)
    post_content_sections = post_content.split("<bigbreak>")
    id_str = ''
    if title:
        id_str = 'id="' + utils.format_for_url(title) + '"'
    post_content_sections[0] = ('<h1><a class="post_title_link" href="' + permalink + '">'
                                + title + '</a></h1>' + post_content_sections[0])
    for i in range(0, len(post_content_sections)):
        post_content_sections[i] = ('<div class="post_content_section">'
                                    + post_content_sections[i] + '</div>')
    return ('''
<div ''' + id_str + ''' class="blog_post">
''' + ''.join(post_content_sections) + '''
</div>''' + metadata_and_comments_section_html(
        title, permalink, taglist, stream_only, metadata,
        allow_comments=allow_comments, Patreon_type=Patreon_type),
        "".join(head))
def info(story):
    words = utils.word_count(story["contents"])
    return " [" + (story["word_count_override"]
                   if "word_count_override" in story
                   else str(((words + 50) // 100) * 100) + " words") + "]"
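# In info(), ((words + 50) // 100) * 100 rounds the word count to the nearest
# hundred: 1249 -> (1299 // 100) * 100 = 1200, and 1250 -> 1300. So a
# 1,249-word story renders as " [1200 words]" unless it supplies
# "word_count_override".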
def is_need_drop(self, node, short=True):
    if node.tag.lower() == 'img':
        return False
    if self.is_bad_node(node):
        return True
    text = node.text_content().strip()
    text_len = word_count(text)
    if text_len == 0 and not node.xpath('.//img'):
        return True
    if short and text_len < 8 and not node.xpath('.//img'):
        return True
    if short and text_len < 20 and not node.xpath('.//img') \
            and re.search(u'^【.*】|^(.*)|^\(.*\)|【.*】$|(.*)$|\(.*\)$', text):
        return True
    # Boilerplate phrases that mark a node as droppable: previous/next links,
    # source/editor/tag lines, copyright and reposting notices, WeChat/QQ
    # follow-and-subscribe promos, QR-code prompts, "related reading", etc.
    filterRe = re.compile(
        u"(上一篇|下一篇|AD|热点关注|原标题|来源|编辑|标签|转自|微信|群号|微信号)[::]|"
        u"追究.*法律责任|关联阅读|请点击|#换成@|关注|(本文|原文|文章)(地址|标题|转自|链接|转载)|原创文章|"
        u"查看原文|延伸阅读|(推荐|相关)文章|转载请注明|继续浏览|正文.*结束|版 权 所 有|"
        u"(转载|登载|观点|允许).*(禁止|版权|本文)|(允许|禁止|版权|本文).*(转载|登载|观点)|"
        u"(关注|订阅|搜索|回复).*微信|微信.*(关注|订阅|搜索|回复)|【.*记者|版权声明|"
        u"(关注|下载).*(扫描|扫码|二维码)|(扫描|扫码|二维码).*(关注|下载)|专题:|"
        u"更多.*(内容|信息|文章).*请|责编|QQ群|^【.*】$|^(.*)$")
    if text_len / float(self.len + 1) < 0.15 or text_len < 100:
        if short and self.title and self.title in text:
            return True
        if emailRe.search(text) or filterRe.search(text):
            return True
    for link in node.xpath('.//a'):
        href = link.get('href', '')
        if href == self.url or self.pages and href in self.pages:
            return False if link.xpath('.//img') else True
        path = get_path(href)
        domain = get_domain(href)
        if domain == self.domain and path in ['/', ''] and link.xpath('.//img'):
            self.drop(link)
    # for img in node.xpath('.//img'):
    #     alt = img.get('alt')
    #     if alt and len(alt) < 50:
    #         if re.search(u'微信二维码', alt):  # "WeChat QR code"
    #             return True
    #         if len(SequenceMatcher(self.title, alt)\
    #                 .get_matching_blocks()) / float(len(self.title)) < 0.3:
    #             return False
    #     title = img.get('title')
    #     if title and len(title) < 50:
    #         if re.search(u'微信二维码', title):
    #             return True
    #         if len(SequenceMatcher(self.title, title)\
    #                 .get_matching_blocks()) / float(len(self.title)) < 0.3:
    #             return False
    if node.xpath('.//img'):
        return 'img'
    return False
from sklearn.model_selection import train_test_split
import pickle as pb

embed_size = 300
data_df = utils.open_csv()
labels = data_df[1].values.tolist()
all_text = data_df[0].values.tolist()
new_text = []

## Clean text
for text in all_text:
    new_text.append(utils.clean_text(text))

wc = utils.word_count(cleaned_text_list=new_text)
embedding_index = utils.create_embeddings_of_word2vec()
vocab_to_int, int_to_vocab = utils.vocab_to_int(wc, embedding_index)
word_embedding_matrix = utils.final_embedding_matrix(vocab_to_int, embedding_index)

## Change sentences to vocab_to_int representation
num_sentences = []
for text in new_text:
    num_sentences.append([vocab_to_int[word] for word in text.split()])

max_len = 0
for seq in num_sentences:
    if len(seq) > max_len:  # track the longest encoded sentence
        max_len = len(seq)
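# max_len is typically used next to pad every encoded sentence to one fixed
# length before training. This padding step is an assumption about where the
# script was heading, not part of the original:
import numpy as np

padded = np.zeros((len(num_sentences), max_len), dtype=np.int32)
for i, seq in enumerate(num_sentences):
    padded[i, :len(seq)] = seq  # tokens left-aligned, zero-padded on the right
x_train, x_test, y_train, y_test = train_test_split(
    padded, labels, test_size=0.2, random_state=42)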