Example #1
def most_repeated(data):
    michael_distinct_count = utils.word_count(' '.join(data["michael"]))
    dwight_distinct_count = utils.word_count(' '.join(data["dwight"]))
    michael_distinct_count = dict(
        sorted(michael_distinct_count.items(),
               key=lambda item: item[1],
               reverse=True))
    dwight_distinct_count = dict(
        sorted(dwight_distinct_count.items(),
               key=lambda item: item[1],
               reverse=True))

    dwight_most_repeated = []
    michael_most_repeated = []
    for word, count in michael_distinct_count.items():
        if word not in dwight_distinct_count:
            michael_most_repeated.append((word, count))
        if len(michael_most_repeated) >= 10:
            break

    for word, count in dwight_distinct_count.items():
        if word not in michael_distinct_count:
            dwight_most_repeated.append((word, count))
        if len(dwight_most_repeated) >= 10:
            break

    print(dwight_most_repeated)
    print(michael_most_repeated)
    utils.plot([x[0] for x in dwight_most_repeated],
               [x[1] for x in dwight_most_repeated], "Dwight", True)
    utils.plot([x[0] for x in michael_most_repeated],
               [x[1] for x in michael_most_repeated], "Michael", True)
    return dwight_most_repeated, michael_most_repeated
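Every example on this page depends on a project-specific word_count helper, and its return type varies: Examples #1, #2, #8 and #9 treat it as a word-to-count dictionary, while others (for instance #5, #10 and #25) use the result as a plain integer. A minimal sketch of the dictionary-returning variant, assuming simple whitespace tokenization (not the actual utils implementation):

# Hypothetical stand-in for the dict-returning utils.word_count
from collections import Counter

def word_count(text):
    # map each whitespace-separated token to its number of occurrences
    return dict(Counter(text.split()))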
Example #2
def compute_RNF(docA, docB):
    wc_A = utils.word_count(docA)
    wc_B = utils.word_count(docB)
    total_A = sum(wc_A.values())
    total_B = sum(wc_B.values())

    RNF = {}

    for word in wc_A:
        if word not in wc_B:
            continue
        RNF[word] = (wc_A[word] / total_A) / (wc_B[word] / total_B)

    return dict(sorted(RNF.items(), key=lambda item: item[1], reverse=True))
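compute_RNF above computes a relative normalized frequency: for every word that both documents share, the word's frequency in docA (normalized by docA's total word count) divided by the same quantity for docB, so values above 1 mark words that are relatively more characteristic of docA. A hypothetical call, assuming the dictionary-returning word_count sketched earlier:

# Hypothetical usage; docA and docB are plain strings.
rnf = compute_RNF("the spam ham spam", "the eggs ham")
# 'spam' and 'eggs' are skipped (not shared); for 'ham':
# (1/4) / (1/3) = 0.75, i.e. 'ham' is relatively rarer in docA.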
Example #3
	def is_bad_node(self, node):
		text = node.text_content().strip()

		if node.tag.lower() in 'img|br':
			return False
		
		if not text and not node.getchildren():
			return True

		for img in node.xpath('.//img'):
			if self.title in img.get('alt', '') \
					or self.title in img.get('title', ''):
				return False

		text_len = word_count(text)
		link_len, link_cnt = 0, 0
		for link in node.findall('.//a'):
			link_cnt += 1
			if not link.text_content():
				continue
			link_len += word_count(link.text_content())

		if link_cnt > 1 and text_len > 1 and link_len / float(text_len) > 0.4:
			return True

		if link_cnt > 1 and text_len / float(link_cnt) < 10:
			return True

		if link_cnt > 1 and node.cssselect('li a'):
			return True

		block_cnt = len(node.xpath(BAD_XPATH))
		if link_cnt > 0 and block_cnt > 1 and len(node.cssselect('pre')) == 0:
			return True

		if text_len / float(self.len + 1) < 0.15 or text_len < 100:
		if re.search(r'\d{3,}-\d+-\d+', text):
				return True
			# filterRe = re.compile(u'点击(.*)(进入|观看)|^事实\+$')
			# if filterRe.match(text):
			# 	return True

		return False
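The link-density checks above flag navigation-like blocks: a node with more than one link is dropped when its link text exceeds 40% of all its text, or when it averages fewer than 10 words per link. A worked instance of the first rule, with hypothetical numbers:

# Hypothetical numbers illustrating the 40% link-density rule
text_len = 50    # words in the whole node
link_len = 25    # words inside its <a> descendants
link_cnt = 3
is_bad = link_cnt > 1 and text_len > 1 and link_len / float(text_len) > 0.4
# 25 / 50 = 0.5 > 0.4, so the node is treated as boilerplate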
Example #5
def post_html(contents, title, permalink, taglist, stream_only, metadata, scrutinize = True, allow_comments = True, Patreon_type = "blog"):
  head = []
  post_content = blog_server_shared.postprocess_post_string(contents, metadata["id"], title, False, scrutinize)[0]
  
  head.append ("<script>window.elidupree.handle_content_warnings ('"+ metadata ["id"]+"', false)</script>" )

  next_transcript_number = 1
  while True:
    transcript_generator = re.search(r"<transcript"+ blog_server_shared.grouped_string_regex("transcript_text")+">", post_content, re.DOTALL)
    if transcript_generator is None:
      break
    transcript_identifier_string = str(next_transcript_number)+'_'+ metadata ["id"]
    post_content = post_content [0: transcript_generator.start(0)]+'<div id="transcript_'+ transcript_identifier_string+'" class="transcript_block"><div class="transcript_header">Transcript: <a id="show_transcript_button_'+ transcript_identifier_string+'" href="javascript:;">(show)</a><a id="hide_transcript_button_'+ transcript_identifier_string+'" href="javascript:;">(hide)</a></div><div class="transcript_content id'+ transcript_identifier_string+'">'+ transcript_generator.group("transcript_text")+'</div></div>' + post_content [transcript_generator.end(0):]
    head.append('''<style> 
html.transcript_hidden_'''+ transcript_identifier_string +''' div.transcript_content.id'''+ transcript_identifier_string +''' {display: none;}
#show_transcript_button_'''+ transcript_identifier_string +''' {display: none;}
html.transcript_hidden_'''+ transcript_identifier_string +''' #show_transcript_button_'''+ transcript_identifier_string +''' {display: inline;}
html.transcript_hidden_'''+ transcript_identifier_string +''' #hide_transcript_button_'''+ transcript_identifier_string +''' {display: none;}
    </style> 
    <script>
    window.elidupree.handle_transcript ("'''+ transcript_identifier_string +'''");
    </script>''')
    next_transcript_number = next_transcript_number + 1

  if stream_only == True:
    cutter = re.compile(r"<cut>.*?</p>.*$", re.DOTALL)
    post_content = cutter.sub ('''[...]</p>
<a class="continue_reading" href="'''+ permalink +'''">Continue reading<span class="invisible"> '''+ title +'''</span>...</a>''', post_content)
    #this sometimes cuts off anchors, so make sure fragments point at the canonical URL
    post_content = re.sub ('href="#','href="' + permalink + '#', post_content)
  else:
    post_content = re.sub ("<cut>", "", post_content)
  
  calculate_readability = (stream_only != True)
  if calculate_readability:
    #using the automated readability index
    reference = re.sub(r"\s+", " ", html.unescape (utils.strip_tags (post_content)))
    sentences = len(re.findall (r"\w\w\w.*?[.?!]", reference))
    words = utils.word_count (reference)
    characters = len(re.findall (r"\w", reference))
    if words >0 and sentences >0:
      readability = 4.71*characters/words +0.5 *words/sentences -21.43
      post_content = post_content + '<em class="debug"> Approximate readability: ' + "{:.2f}".format (readability) + " ("+ str (characters) + " characters, " + str (words) +  " words, " + str (sentences)  + " sentences, " + "{:.2f}".format (characters/words) + " characters per word, " + "{:.2f}".format (words/sentences) + " words per sentence)</em>"
  
  post_content_sections = post_content.split("<bigbreak>")
  id_str = ''
  if title:
    id_str = 'id="'+utils.format_for_url(title)+'"'
    post_content_sections[0] = '<h1><a class="post_title_link" href="'+permalink+'">'+title+'</a></h1>'+post_content_sections[0]
  for i in range(0, len(post_content_sections)):
    post_content_sections[i] = '<div class="post_content_section">'+post_content_sections[i]+'</div>'
  return ('''
<div '''+id_str+''' class="blog_post">
  '''+(''.join(post_content_sections))+'''
</div>'''+metadata_and_comments_section_html(title, permalink, taglist, stream_only, metadata, allow_comments = allow_comments, Patreon_type = Patreon_type), "".join (head))
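The readability block above is the Automated Readability Index, ARI = 4.71*(characters/words) + 0.5*(words/sentences) - 21.43, approximated with regex-based character, word and sentence counts. The same computation as a standalone sketch (whitespace splitting stands in for utils.word_count):

# Standalone sketch of the Automated Readability Index used above
import re

def ari(text):
    sentences = len(re.findall(r"\w\w\w.*?[.?!]", text)) or 1
    words = len(text.split()) or 1        # stand-in for utils.word_count
    characters = len(re.findall(r"\w", text))
    return 4.71 * characters / words + 0.5 * words / sentences - 21.43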
Example #6
	def __init__(self, input, **options):
		self.input = input
		self.url = options.get('url', '')
		self.debug = options.get('debug', False)
		self.title = options.get('title', '^^')
		self.pages = options.get('pages', None)
		self.texts = options.get('texts', None)
		self.domain = get_domain(self.url)
		self.options = options
		self.doc = clean_html(input, return_doc=True)
		self.text = self.doc.text_content()
		self.len = word_count(self.text) if self.text else 0
Example #8
def get_common_words(data_description_text_list):
    """
    Returns the common words among the text descriptions in the input list.
    """
    processed_description_text_list = sanitize_sample_descriptions(
        data_description_text_list)
    all_text = ' '.join(processed_description_text_list)
    counts = word_count(all_text)
    return [
        text for text, count in sorted(counts.items(),
                                       key=operator.itemgetter(1))[::-1]
    ]
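A hypothetical call, assuming sanitize_sample_descriptions leaves these toy inputs unchanged; the sort on counts plus [::-1] puts the most frequent words first:

# Hypothetical usage of get_common_words
common = get_common_words(["red apple pie", "red apple", "red wine"])
# -> ['red', 'apple', ...]  ('red' occurs three times, 'apple' twice)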
Example #9
def word_frequency(data):
    # join with an explicit separator so Michael's last word and
    # Dwight's first word are not fused into one token
    word_frequency = utils.word_count(' '.join(data["michael"]) + ' ' +
                                      ' '.join(data["dwight"]))

    word_frequency = dict(
        sorted(word_frequency.items(), key=lambda item: item[1], reverse=True))

    for entry in list(word_frequency.items())[:10]:
        print(entry)

    utils.plot(
        list(word_frequency.keys())[:180],
        list(word_frequency.values())[:180], "Histogram of Word Frequencies",
        True, True)
Example #10
def sphinx(r, audio):

    payload = {'count': 'invalid'}
    meta = {}

    try:
        phrase = r.recognize_sphinx(audio)
        payload['count'] = utils.word_count(phrase)
        meta['text'] = phrase.split()
    except sr.UnknownValueError:
        payload['error'] = "Sphinx could not understand audio"
        logger.error("Sphinx couldn't understand audio")
    except sr.RequestError as e:
        payload['error'] = "Sphinx error; {0}".format(e)
        logger.error("Sphinx error; {0}".format(e))

    payload.update({'meta': meta})
    return payload
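Examples #10 and #13 through #18 all wrap speech_recognition backends in the same pattern: call the recognizer, store utils.word_count(phrase) under payload['count'], and convert the library's two exception types into an 'error' entry. A hedged sketch of that common shape, with the backend call passed in as a parameter:

# Generic form of the wrappers on this page; recognize is a bound method
# such as r.recognize_sphinx, and utils is the project helper assumed above.
import speech_recognition as sr

def transcribe(recognize, audio, **kwargs):
    payload = {'count': 'invalid', 'meta': {}}
    try:
        phrase = recognize(audio, **kwargs)
        payload['count'] = utils.word_count(phrase)
        payload['meta']['text'] = phrase.split()
    except sr.UnknownValueError:
        payload['error'] = "recognizer could not understand audio"
    except sr.RequestError as e:
        payload['error'] = "recognizer request failed; {0}".format(e)
    return payload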
Example #11
    def add_first_layer_features(self):
        '''
        Add first layer of features using the udf functions from util.py.

        Input:
        -------
        None

        Output:
        -------
        None
        '''
        self.df = self.df.withColumn('sentence_cnt',utils.sentence_count(self.df.reviewText)) \
                         .withColumn('word_cnt',utils.word_count(self.df.reviewText)) \
                         .withColumn('capital_cnt',utils.count_capital(self.df.reviewText)) \
                         .withColumn('upper_word_cnt',utils.all_caps(self.df.reviewText)) \
                         .withColumn('punctuation_cnt',utils.count_punctuation(self.df.reviewText)) \
                         .withColumn('overall_transform',utils.overall_transform(self.df.overall))
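For the withColumn chain above to work, each helper in utils must be a PySpark UDF, since withColumn expects a Column expression. A minimal sketch of how the word-count UDF could be defined (assumed, not the project's actual utils.word_count):

# Assumed shape of utils.word_count as a PySpark UDF
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

@udf(returnType=IntegerType())
def word_count(text):
    # whitespace tokenization; null-safe for missing reviewText
    return len(text.split()) if text else 0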
Example #12
def vectorize(essay, setnum):
    topics = [[setnum, essay]]
    x_sennum = utils.sentence_num(topics)
    x_senlen = utils.sentence_len(topics)
    x_wlen = utils.word_count(topics)
    x_lwlen = utils.long_word_count(topics)
    x_pclen = utils.punctuation_count(topics)
    x_uclen = utils.unique_valid_word_count(topics)
    x_awlen = utils.average_word_length(topics)
    x_pslen = utils.part_of_speech_count(topics)
    x_uplen = utils.unique_valid_word_prop(topics)
    x_one = get_ngram(setnum, 1, essay)
    x_two = get_ngram(setnum, 2, essay)
    x_three = get_ngram(setnum, 3, essay)
    x_four = get_ngram(setnum, 4, essay)
    x_five = get_ngram(setnum, 5, essay)
    vector_dict = {'sentence_num': x_sennum, 'sentence_len': x_senlen, 'word_count': x_wlen, 'long_word_count': x_lwlen, 'punctuation_count': x_pclen, 'unique_valid_word_count': x_uclen, 'average_word_length': x_awlen,
                    'noun_adj_adv_count': x_pslen, 'unique_valid_word_prop': x_uplen, '1gram_frequency': x_one, '2gram_frequency': x_two, '3gram_frequency': x_three, '4gram_frequency': x_four, '5gram_frequency': x_five}
    return vector_dict
Example #13
def wit(r, audio):

    payload = {'count': 'invalid'}
    meta = {}

    try:
        phrase = r.recognize_wit(audio, key=creds['WIT_AI_KEY'])
        payload['count'] = utils.word_count(phrase)
        meta['text'] = phrase.split()
    except sr.UnknownValueError:
        payload['error'] = "Wit.ai could not understand audio"
        logger.error("Wit.ai couldn't understand audio")
    except sr.RequestError as e:
        payload['error'] = "Could not request results from Wit.ai service; {0}".format(e)
        logger.error(
            "Could not request results from Wit.ai service; {0}".format(e))

    payload.update({'meta': meta})
    return payload
Example #14
def google_sound_cloud(r, audio):

    payload = {'count': 'invalid'}
    meta = {}

    try:
        phrase = r.recognize_google_cloud(
            audio, credentials_json=creds['GOOGLE_CLOUD_SPEECH'])
        payload['count'] = utils.word_count(phrase)
        meta['text'] = phrase.split()
    except sr.UnknownValueError:
        payload['error'] = "Google Cloud Speech could not understand audio"
        logger.error("Google Cloud Speech couldn't understand audio")
    except sr.RequestError as e:
        payload[
            'error'] = "Could not request results from Google Cloud Speech service; {0}".format(
                e)
        logger.error(
            "Could not request results from Google Cloud Speech service; {0}"
            .format(e))

    payload.update({'meta': meta})
    return payload
Example #15
def bing(r, audio):

    payload = {'count': 'invalid'}
    meta = {}

    try:
        phrase = r.recognize_bing(audio, key=creds['BING_KEY'])
        payload['count'] = utils.word_count(phrase)
        meta['text'] = phrase.split()
    except sr.UnknownValueError:
        payload['error'] = "Microsoft Bing Voice Recognition could not understand audio"
        logger.error(
            "Microsoft Bing Voice Recognition couldn't understand audio")
    except sr.RequestError as e:
        payload['error'] = "Could not request results from Microsoft Bing Voice Recognition service; {0}".format(e)
        logger.error(
            "Could not request results from Microsoft Bing Voice Recognition service; {0}"
            .format(e))

    payload.update({'meta': meta})
    return payload
Example #16
def google(r, audio):

    payload = {'count': 'invalid'}
    meta = {}

    try:
        phrase = r.recognize_google(audio)
        payload['count'] = utils.word_count(phrase)
        meta['text'] = phrase.split()
    except sr.UnknownValueError:
        payload[
            'error'] = "Google Speech Recognizer could not understand audio"
        logger.error("Google Speech Recognizer couldn't understand audio")
    except sr.RequestError as e:
        payload[
            'error'] = "Could not request results from Google Speech Recognition service; {0}".format(
                e)
        logger.error(
            "Could not request results from Google Speech Recognition service; {0}"
            .format(e))

    payload.update({'meta': meta})
    return payload
Example #17
def houndify(r, audio):

    payload = {'count': 'invalid'}
    meta = {}

    try:
        phrase = r.recognize_houndify(audio,
                                      client_id=creds['HOUNDIFY_CLIENT_ID'],
                                      client_key=creds['HOUNDIFY_CLIENT_KEY'])
        payload['count'] = utils.word_count(phrase)
        meta['text'] = phrase.split()
    except sr.UnknownValueError:
        payload['error'] = "Houndify could not understand audio"
        logger.error("Houndify couldn't understand audio")
    except sr.RequestError as e:
        payload[
            'error'] = "Could not request results from Houndify service; {0}".format(
                e)
        logger.error(
            "Could not request results from Houndify service; {0}".format(e))

    payload.update({'meta': meta})
    return payload
Example #18
def ibm(r, audio):

    payload = {'count': 'invalid'}
    meta = {}

    try:
        phrase = r.recognize_ibm(audio,
                                 username=creds['IBM_USERNAME'],
                                 password=creds['IBM_PASSWORD'])
        payload['count'] = utils.word_count(phrase)
        meta['text'] = phrase.split()
    except sr.UnknownValueError:
        payload['error'] = "IBM Speech to Text could not understand audio"
        logger.error("IBM Speech to Text couldn't understand audio")
    except sr.RequestError as e:
        payload[
            'error'] = "Could not request results from IBM Speech to Text service; {0}".format(
                e)
        logger.error(
            "Could not request results from IBM Speech to Text service; {0}".
            format(e))

    payload.update({'meta': meta})
    return payload
Example #19
def img2center(doc, title):
    for node in list(doc.iter()):
        parent = node.getparent()
        previous = node.getprevious()
        next = node.getnext()

        for key, value in node.attrib.items():
            if key not in ['src', 'href']:
                node.attrib.pop(key)
            if key in ['style'] and 'center' in value:
                node.set('style', 'text-align:center')

        if node.tag == 'a':
            node.set('target', '_blank')
        elif str(node.tag).lower() in 'h1|h2':
            node.tag = 'h3'
        elif node.tag == 'img' and parent is not None:
            replace_node(
                '<div class="k-img" style="text-align:center;">%s</div>', node)

            if previous is None and parent.text and parent.text.strip() \
              or previous is not None \
              and (previous.tail or str(previous.tag).lower() not in 'p|div'):
                node.addprevious(fromstring('<br>'))
            if node.tail and node.tail.strip():
                node.addnext(fromstring('<br>'))
            elif next is not None and str(next.tag).lower() not in 'p|div':
                next.addprevious(fromstring('<br>'))

            if next is not None and next.text and next.text.strip():
                text = next.text.strip()
                if word_count(text) < 40 \
                  and (not re.match(u'.*[:.?!:。?!…]$', text) \
                   or re.search(u'制图|资料图|图片|图|摄', text)) \
                  and not re.search(u'(\d+|[一二三四五六七八九十]+)[、.]', text):
                    replace_node('<div style="text-align:center;">%s</div>',
                                 next)
                    continue

            if previous is None and not parent.text:
                pprevious = parent.getprevious()
                if pprevious is not None \
                  and not pprevious.xpath(BLOCK_XPATH):
                    text = pprevious.text_content().strip()
                    if word_count(text) < 40 \
                      and (not re.match(u'.*[:.?!:。?!…]$', text) \
                       or re.search(u'制图|资料图|图片|图|摄', text)) \
                      and not re.search(u'(\d+|[一二三四五六七八九十]+)[、.]', text):
                        pprevious.set('style', 'text-align:center')
                        continue

            if not node.tail and node.getnext() is None:
                pnext = parent.getnext()
                if pnext is not None \
                  and not pnext.xpath(BLOCK_XPATH):
                    text = pnext.text_content().strip()
                    if word_count(text) < 40 \
                      and (not re.match(u'.*[:.?!:。?!…]$', text) \
                       or re.search(u'制图|资料图|图片|图|摄', text)) \
                      and not re.search(u'(\d+|[一二三四五六七八九十]+)[、.]', text):
                        pnext.set('style', 'text-align:center')
                        continue

    for node in doc.iter('pre'):
        if not node.getchildren():
            node.text = re.sub(r'(^|\r|\n) *\d+', '', (node.text or ''))

    for node in doc.iter('img'):
        node.set('alt', title)
        node.set('title', title)
    story["pages"] = [post for post in story["pages"] if "don't deploy" not in post]
  for post_dict in story["pages"]:
    index = index + 1
    #post_dict["path_prefix"] = story["url"]+"/"
    post_dict["long_story_name"] = name
    post_dict["long_story_index"] = index
    if "listed" in story:
      post_dict["listed"] = True
    posts ["stories"].append(post_dict)
    
for cat,post_list in posts.items():
  for post_dict in post_list:
    if "long_story_name" not in post_dict:
      post_dict["path_prefix"] = "/" if cat=="" else "/"+cat+"/"
    post_dict["category"] = cat
    post_dict["word_count"] = utils.word_count (html.unescape (utils.strip_tags (post_dict ["contents"])))
    if "auto_paragraphs" in post_dict:
      post_dict ["contents"] = utils.auto_paragraphs (post_dict ["contents"])
    if cat == 'blog':
      post_dict['contents'] += signature
      
for name,story in long_stories.items():
  story["word_count"] = 0
  for post_dict in story["pages"]:
    story["word_count"] = story["word_count"] + post_dict["word_count"]


css.insert ('''
a.small_story {display: block; padding: 0.8em 0; color: black; text-decoration: none;}
a.small_story h2 {font-weight: bold; color: black;}
a.small_story .blurb {font-size:71%;}
Example #21
def get_words(self):
    words = word_count(self.train_data)
    # keep the most common characters
    return words[:len(words)] + (' ', )
Example #22
def wc(self):
    return word_count(self.text)
Example #24
def post_html(contents, title, permalink, taglist, stream_only, metadata, scrutinize = True, allow_comments = True, Patreon_type = "blog"):
  head = []
  post_content = blog_server_shared.postprocess_post_string(contents, metadata["id"], title, False, scrutinize)[0]
  
  before_content_warnings = post_content
  
  content_warning_header_regex = re.compile(r"<content_warning_header"+blog_server_shared.grouped_string_regex("content_warning_header_contents")+">", re.DOTALL)
  post_content = content_warning_header_regex.sub(lambda match: ('''

<div class="story_content_warning_header">
  <p>This story contains:</p>
  '''+hidden_cw_box('''
  <ul>
    '''+match.group("content_warning_header_contents")+'''
  </ul>
  <p>Notices will also appear in-context in the story, just before the material appears.</p>
  <p>If you see other material that should be marked (such as common triggers or phobias), '''+exmxaxixl.a('e-mail me')+'''. I am serious about web accessibility, and I will respond to your concerns as soon as I can manage.</p>
  ''')+'''
</div>'''), post_content)

  content_warning_p_regex = re.compile(r"<content_warning_p"+blog_server_shared.grouped_string_regex("content_warning_p_contents")+">", re.DOTALL)
  post_content = content_warning_p_regex.sub(lambda match: secondary_hidden_cw_box('This section depicts '+match.group("content_warning_p_contents")+'.'), post_content)
  
  if post_content != before_content_warnings:
    head.append ("<script>window.elidupree.handle_content_warnings('"+ metadata ["id"]+"', false)</script>" )

  next_transcript_number = 1
  while True:
    transcript_generator = re.search(r"<transcript"+ blog_server_shared.grouped_string_regex("transcript_text")+">", post_content, re.DOTALL)
    if transcript_generator is None:
      break
    transcript_identifier_string = str(next_transcript_number)+'_'+ metadata ["id"]
    post_content = post_content [0: transcript_generator.start(0)]+'<div id="transcript_'+ transcript_identifier_string+'" class="transcript_block"><div class="transcript_header">Transcript: <a id="show_transcript_button_'+ transcript_identifier_string+'" href="javascript:;">(show)</a><a id="hide_transcript_button_'+ transcript_identifier_string+'" href="javascript:;">(hide)</a></div><div class="transcript_content id'+ transcript_identifier_string+'">'+ transcript_generator.group("transcript_text")+'</div></div>' + post_content [transcript_generator.end(0):]
    head.append('''<style> 
html.transcript_hidden_'''+ transcript_identifier_string +''' div.transcript_content.id'''+ transcript_identifier_string +''' {display: none;}
#show_transcript_button_'''+ transcript_identifier_string +''' {display: none;}
html.transcript_hidden_'''+ transcript_identifier_string +''' #show_transcript_button_'''+ transcript_identifier_string +''' {display: inline;}
html.transcript_hidden_'''+ transcript_identifier_string +''' #hide_transcript_button_'''+ transcript_identifier_string +''' {display: none;}
    </style> 
    <script>
    window.elidupree.handle_transcript ("'''+ transcript_identifier_string +'''");
    </script>''')
    next_transcript_number = next_transcript_number + 1

  if stream_only == True:
    cutter = re.compile(r"<cut>.*?</p>.*$", re.DOTALL)
    post_content = cutter.sub ('''[...]</p>
<a class="continue_reading" href="'''+ permalink +'''">Continue reading<span class="invisible"> '''+ title +'''</span>...</a>''', post_content)
    #this sometimes cuts off anchors, so make sure fragments point at the canonical URL
    post_content = re.sub ('href="#','href="' + permalink + '#', post_content)
  else:
    post_content = re.sub ("<cut>", "", post_content)
  
  calculate_readability = (stream_only != True)
  if calculate_readability:
    #using the automated readability index
    reference = re.sub(r"\s+", " ", html.unescape (utils.strip_tags (post_content)))
    sentences = len(re.findall (r"\w\w\w.*?[.?!]", reference))
    words = utils.word_count (reference)
    characters = len(re.findall (r"\w", reference))
    if words >0 and sentences >0:
      readability = 4.71*characters/words +0.5 *words/sentences -21.43
      post_content = '<em class="debug"> Approximate readability: ' + "{:.2f}".format (readability) + " ("+ str (characters) + " characters, " + str (words) +  " words, " + str (sentences)  + " sentences, " + "{:.2f}".format (characters/words) + " characters per word, " + "{:.2f}".format (words/sentences) + " words per sentence)</em>" + post_content
  
  post_content_sections = post_content.split("<bigbreak>")
  id_str = ''
  if title:
    id_str = 'id="'+utils.format_for_url(title)+'"'
    post_content_sections[0] = '<h1><a class="post_title_link" href="'+permalink+'">'+title+'</a></h1>'+post_content_sections[0]
  for i in range(0, len(post_content_sections)):
    post_content_sections[i] = '<div class="post_content_section">'+post_content_sections[i]+'</div>'
  return ('''
<div '''+id_str+''' class="blog_post">
  '''+(''.join(post_content_sections))+'''
</div>'''+metadata_and_comments_section_html(title, permalink, taglist, stream_only, metadata, allow_comments = allow_comments, Patreon_type = Patreon_type), "".join (head))
Example #25
def info(story):
    words = utils.word_count(story["contents"])
    return " [" + (story["word_count_override"] if "word_count_override" in story else str(((words + 50)//100)*100) + " words") + "]"
Example #26
	def is_need_drop(self, node, short=True):
		if node.tag.lower() == 'img':
			return False

		if self.is_bad_node(node):
			return True

		text = node.text_content().strip()
		text_len = word_count(text)

		if text_len == 0 and not node.xpath('.//img'):
			return True

		if short and text_len < 8 and not node.xpath('.//img'):
			return True

		if short and text_len < 20 and not node.xpath('.//img') \
				and re.search(u'^【.*】|^（.*）|^\(.*\)|【.*】$|（.*）$|\(.*\)$', text):
			return True

		filterRe = re.compile(
			u"(上一篇|下一篇|AD|热点关注|原标题|来源|编辑|标签|转自|微信|群号|微信号)[::]|"
			u"追究.*法律责任|关联阅读|请点击|#换成@|关注|(本文|原文|文章)(地址|标题|转自|链接|转载)|原创文章|"
			u"查看原文|延伸阅读|(推荐|相关)文章|转载请注明|继续浏览|正文.*结束|版 权 所 有|"
			u"(转载|登载|观点|允许).*(禁止|版权|本文)|(允许|禁止|版权|本文).*(转载|登载|观点)|"
			u"(关注|订阅|搜索|回复).*微信|微信.*(关注|订阅|搜索|回复)|【.*记者|版权声明|"
			u"(关注|下载).*(扫描|扫码|二维码)|(扫描|扫码|二维码).*(关注|下载)|专题:|"
			u"更多.*(内容|信息|文章).*请|责编|QQ群|^【.*】$|^(.*)$")

		if text_len / float(self.len + 1) < 0.15 or text_len < 100:
			if short and self.title and self.title in text:
				return True
			if emailRe.search(text) or filterRe.search(text):
				return True

		for link in node.xpath('.//a'):
			href = link.get('href', '')
			if href == self.url or self.pages and href in self.pages:
				return False if link.xpath('.//img') else True
			path = get_path(href)
			domain = get_domain(href)
			if domain == self.domain and path in ['/', ''] and link.xpath('.//img'):
				self.drop(link)

		# for img in node.xpath('.//img'):
		# 	alt = img.get('alt')
		# 	if alt and len(alt) < 50:
		# 		if re.search(u'微信二维码', alt):
		# 			return True
		# 		if len(SequenceMatcher(self.title, alt)\
		# 				.get_matching_blocks()) / float(len(self.title)) < 0.3:
		# 			return False
			
		# 	title = img.get('title')
		# 	if title and len(title) < 50:
		# 		if re.search(u'微信二维码', title):
		# 			return True
		# 		if len(SequenceMatcher(self.title, title)\
		# 				.get_matching_blocks()) / float(len(self.title)) < 0.3:
		# 			return False

		if node.xpath('.//img'):
			return 'img'

		return False
Example #29
from sklearn.model_selection import train_test_split
import pickle as pb
import utils  # the project's helper module used throughout this snippet

embed_size = 300

data_df = utils.open_csv()
labels = data_df[1].values.tolist()

all_text = data_df[0].values.tolist()
new_text = []

## Clean text
for text in all_text:
    new_text.append(utils.clean_text(text))

wc = utils.word_count(cleaned_text_list=new_text)

embedding_index = utils.create_embeddings_of_word2vec()

vocab_to_int, int_to_vocab = utils.vocab_to_int(wc, embedding_index)

word_embedding_matrix = utils.final_embedding_matrix(vocab_to_int,
                                                     embedding_index)

## Change sentences to vocab_to_int representation
num_sentences = []
for text in new_text:
    num_sentences.append([vocab_to_int[word] for word in text.split()])

max_len = 0
for seq in num_sentences: