import os
import sys
import re
import time
import subprocess

import jieba
import schedule

#from db.db import DB

# Load the traditional Chinese dictionary for jieba before any segmentation.
dict_path = os.path.join(os.getenv("JIEBA_DATA"), "dict.txt.big")
jieba.set_dictionary(dict_path)

#db = DB(os.getenv('DATA'))
crawler = PttWebCrawler()
analyzier = Analyzier()
Filter = ArticleFilter()
interface = Interface()
engine = SearchEngine(os.getenv('DOC'), os.getenv('TFIDF_DATA'))


def get_url(texts):
    '''
    Return the first image url that can be resolved from the given texts.
    Args:
        texts: list of string
    Return:
        string or None
    '''
    for text in texts:
        urls = analyzier.get_url(text)
        for url in urls:
            image_url = analyzier.open_url(url)
            if image_url is not None:
                return image_url


def generate_post(board, article, summary, response, title, paragraph,
                  article_url, url, image_url):
class Analyzier:
    '''
    Analyze the ptt article
    '''

    def __init__(self):
        '''
        tr4s: extract key sentences
        tr4w: extract keywords
        filter: clean the text and use some magic regex
        '''
        self.tr4s = TextRank4Sentence(stop_words_file=stopword_path)
        self.tr4w = TextRank4Keyword(stop_words_file=stopword_path)
        self.filter = ArticleFilter()

    def get_url(self, content):
        '''
        get the urls from content
        Args:
            content: string
        Return:
            list of string
        '''
        return self.filter.get_url(content)

    def get_content_len(self, content):
        '''
        get the length (number of lines) of content
        Args:
            content: string
        Return:
            integer
        '''
        clean_content = self.filter.clean_content(content=content)
        return len(clean_content.split('\n'))

    def get_response_num(self, responses):
        '''
        get the number of useful responses
        Args:
            responses: list of dict
        Return:
            integer
        '''
        clean_responses = self.filter.clean_responses(responses, self.filter.stopwords)
        print('Response: {}, Clean response: {}'.format(len(responses), len(clean_responses)))
        return len(clean_responses)

    def get_response_url(self, responses):
        '''
        get all urls from the responses
        Args:
            responses: list of dict
        Return:
            list of string
        '''
        urls = []
        for response in responses:
            content = re.sub(r'\ +', '//', response['Content'])
            urls += self.filter.get_url(response['Content'])
            urls += self.filter.get_url(content)
        return list(set(urls))

    def check_article(self, content):
        '''
        check whether the article is a 'Give P Coin' article
        Args:
            content: string
        Return:
            bool
        '''
        pattern = '[0-9]+?P'
        reward = re.findall(pattern, content)
        return len(reward) > 0

    def open_url(self, url):
        '''
        Try to get the image url from the input url.
        If the input url is not an image and the Open Graph protocol gives
        nothing, it returns None.
        Args:
            url: string
        Return:
            string or None
        '''
        def get_type(url):
            print(url)
            try:
                with urllib.request.urlopen(url) as response:
                    mime_type = response.info().get_content_type()
                    print(mime_type)
                    return mime_type.split('/')
            except:
                return [None, None]

        if get_type(url)[0] == 'image':
            return url
        else:
            print('try og')
            try:
                page = metadata_parser.MetadataParser(url=url)
                image_link = page.get_metadata_link('image')
                if image_link is not None:
                    #image_url.append(image_link)
                    return image_link
            except:
                return None

    def find_summary(self, content, summary_num=5, debug=True):
        '''
        generate the summary from the input content
        Args:
            content: string
            summary_num: integer, how many summary parts you want
            debug: bool, whether to print the result
        Return:
            list of string
        '''
        clean_content = self.filter.clean_content(content=content)
        # extract at most (number of lines / 5) sentences from the content
        max_num = int(len(clean_content.split('\n')) / 5)
        num = max_num if max_num > summary_num * 2 else summary_num * 2
        if len(clean_content) < 150:  # if the article is short enough, keep every sentence
            num = 1e6
        if num > 20:  # if the article is too long, cap the number of sentences
            num = 20
        key_sentences = self.extract_key_sentences(clean_content, sort_by_index=True, num=num)
        key_sentences = [x[2] for x in key_sentences]
        if debug:
            print('Original content:', content)
            print('Cleaned content:', clean_content)
            print('Length of cleaned content:', len(clean_content))
            print('Key sentences:', key_sentences)
            print('Num of key sentences', len(key_sentences))

        # Divide the key sentences into $summary_num parts
        summarys = []
        summary_len = [1 for _ in range(summary_num)]  # each part deserves 1 sentence
        rest = len(key_sentences) - summary_num  # number of remaining sentences
        if rest > summary_num:
            # equally distribute the key sentences to all parts
            factor = int(rest / summary_num)
            rest = rest - (factor * summary_num)
            summary_len = [x + factor for x in summary_len]
        rest_count = 0
        for i in range(len(summary_len)):
            if rest > 0:  # assign the remaining sentences if we have any
                summary_len[i] += 1
                rest -= 1
            summarys.append(','.join(key_sentences[rest_count:rest_count + summary_len[i]]))
            rest_count += summary_len[i]
        return summarys

    def find_useful_response(self, responses, num=5):
        '''
        pick the most important responses with TextRank
        Args:
            responses: list of dict
            num: integer, how many responses we want
        Return:
            list of dict
        '''
        clean_responses = self.filter.clean_responses(responses, self.filter.stopwords)
        if len(clean_responses) != 0:
            responses = clean_responses

        # Preserve the original responses and merge all of them into one article
        response_dict = {}
        all_response = ''
        for response in responses:
            all_response += response['Content'].replace(' ', '') + '\n'
            response_dict[response['Content'].replace(' ', '').strip()] = response

        # run text rank
        key_responses = self.extract_key_sentences(all_response, sort_by_index=False, num=num)

        # restore the responses
        important_responses = []
        for r in key_responses:
            if r[2].strip() in response_dict.keys():
                response = response_dict[r[2].strip()]
                author, content = response['User'], response['Content']
                ipdatetime, vote = response['Ipdatetime'], response['Vote']
            else:
                author, content = 'unk', r[2]
                ipdatetime, vote = 'unk', 'unk'
            content = re.sub(r'\ +', ',', content)
            important_responses.append({
                'author': author,
                'content': content,
                'vote': vote,
                'ipdatetime': ipdatetime
            })
        return important_responses

    def extract_keywords(self, content):
        '''
        extract the keywords from content
        Args:
            content: string
        Return:
            key_words: list of string
        '''
        clean_content = self.filter.clean_content(content=content)
        self.tr4w.analyze(text=clean_content, lower=True, window=2)
        key_words = []
        for item in self.tr4w.get_keywords(20, word_min_len=1):
            #print(item.word, item.weight)
            key_words.append(item.word)
        return key_words

    def extract_key_sentences(self, content, sort_by_index=False, num=5):
        '''
        extract key sentences from content
        Args:
            content: string
            sort_by_index: bool, whether to sort the output by line index
            num: integer, how many sentences we want
        Return:
            list of [index, weight, sentence] for each key sentence
        '''
        self.tr4s.analyze(text=content, lower=True, source='all_filters')
        key_sentences = []
        for item in self.tr4s.get_key_sentences(num=num, sentence_min_len=1):
            key_sentences.append([item.index, item.weight, item.sentence])
            #print(item.index, item.weight, item.sentence)
        #print('=====')

        def index(x):
            return x[0]

        def weight(x):
            return x[1]

        return sorted(key_sentences, key=index) if sort_by_index else sorted(key_sentences, key=weight)
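# A minimal usage sketch, not part of the original source: it assumes the
# module-level stopword_path, ArticleFilter, and textrank4zh imports used by
# Analyzier are available, and the sample text below is purely hypothetical.
if __name__ == '__main__':
    analyzier = Analyzier()
    sample_content = '這是一篇測試文章。\n它有好幾行內容。\n用來示範摘要與關鍵字的流程。'
    print(analyzier.extract_keywords(sample_content))                          # TextRank keywords
    print(analyzier.find_summary(sample_content, summary_num=2, debug=False))  # 2-part summary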
class Interface:
    def __init__(self):
        self.filter = ArticleFilter()
        self.SENTENCE_START = '<s>'
        self.SENTENCE_END = '</s>'
        dm_single_close_quote = u'\u2019'  # unicode
        dm_double_close_quote = u'\u201d'
        self.END_TOKENS = [
            '.', '!', '?', '...', "'", "`", '"',
            dm_single_close_quote, dm_double_close_quote, ")"
        ]

    def prepare_news(self, content, path):
        '''Clean and segment the content with jieba, then write one token per line to path.'''
        content = self.filter.clean_content(content)
        content_cutted = ' '.join(jieba.cut(content.strip(), cut_all=False))
        content_splitted = re.sub(r'\ +', '\n', content_cutted).strip()
        content_splitted = re.sub(r'\n+', '\n', content_splitted)
        with open(path, 'w') as f:
            f.write(content_splitted + '\n')

    def read_text_file(self, text_file):
        '''Read a text file and return its non-empty, stripped lines.'''
        lines = []
        with open(text_file, "r") as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                lines.append(line.strip())
        return lines

    def fix_missing_period(self, line):
        """Adds a period to a line that is missing a period"""
        if "@highlight" in line:
            return line
        if line == "":
            return line
        if line[-1] in self.END_TOKENS:
            return line
        # print line[-1]
        return line + " ."

    def chunk_file(self, in_file, chunks_dir):
        '''Split a length-prefixed binary file into chunks of at most 1000 examples.'''
        reader = open(in_file, "rb")
        chunk = 0
        finished = False
        while not finished:
            # new chunk
            chunk_fname = os.path.join(chunks_dir, '%s_%03d.bin' % ('test', chunk))
            with open(chunk_fname, 'wb') as writer:
                for _ in range(1000):
                    len_bytes = reader.read(8)
                    if not len_bytes:
                        finished = True
                        break
                    str_len = struct.unpack('q', len_bytes)[0]
                    example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
                    writer.write(struct.pack('q', str_len))
                    writer.write(struct.pack('%ds' % str_len, example_str))
            chunk += 1
        reader.close()

    def get_art_abs(self, story_file):
        lines = self.read_text_file(story_file)

        # Lowercase everything
        lines = [line.lower() for line in lines]

        # Put periods on the ends of lines that are missing them (this is a problem
        # in the dataset because many image captions don't end in periods;
        # consequently they end up in the body of the article as run-on sentences)
        lines = [self.fix_missing_period(line) for line in lines]

        # Separate out article and abstract sentences
        article_lines = []
        highlights = []
        next_is_highlight = False
        for idx, line in enumerate(lines):
            if line == "":
                continue  # empty line
            elif line.startswith("@highlight"):
                next_is_highlight = True
            elif next_is_highlight:
                highlights.append(line)
            else:
                article_lines.append(line)

        # Make article into a single string
        article = ' '.join(article_lines)

        # Make abstract into a single string, putting <s> and </s> tags around the sentences
        abstract = ' '.join([
            "%s %s %s" % (self.SENTENCE_START, sent, self.SENTENCE_END)
            for sent in highlights
        ])
        if len(highlights) == 0:
            abstract = 'test'
        #print(abstract)
        return article, abstract

    def write_to_bin(self, news_dir, out_file):
        '''Pack every story file in news_dir into one length-prefixed tf.Example file.'''
        story_fnames = os.listdir(news_dir)
        num_stories = len(story_fnames)
        with open(out_file, 'wb') as writer:
            for idx, s in enumerate(story_fnames):
                #print(s)
                # Look in the tokenized story dirs to find the .story file corresponding to this url
                if os.path.isfile(os.path.join(news_dir, s)):
                    story_file = os.path.join(news_dir, s)
                    article, abstract = self.get_art_abs(story_file)

                    # Write to tf.Example
                    if len(article) == 0:  # warn about empty articles
                        print('error!')
                    tf_example = example_pb2.Example()
                    tf_example.features.feature['article'].bytes_list.value.extend(
                        [bytes(article, 'utf-8')])
                    tf_example.features.feature['abstract'].bytes_list.value.extend(
                        [bytes(abstract, 'utf-8')])
                    tf_example_str = tf_example.SerializeToString()
                    str_len = len(tf_example_str)
                    writer.write(struct.pack('q', str_len))
                    writer.write(struct.pack('%ds' % str_len, tf_example_str))
        print("Finished writing file %s\n" % out_file)
        return story_fnames
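# A minimal usage sketch, not part of the original source: it assumes the
# os/re/struct/jieba imports and the tensorflow example_pb2 module used by
# Interface are available. The paths and sample text are placeholders; the
# flow shown is prepare_news -> write_to_bin -> chunk_file.
if __name__ == '__main__':
    interface = Interface()
    news_dir, bin_file, chunks_dir = 'news_tmp', 'test.bin', 'chunked'
    os.makedirs(news_dir, exist_ok=True)
    os.makedirs(chunks_dir, exist_ok=True)
    # one segmented story file per article
    interface.prepare_news('這是一段測試內容。', os.path.join(news_dir, '0.story'))
    # pack the story directory into a single length-prefixed tf.Example file
    interface.write_to_bin(news_dir, bin_file)
    # split the binary file into chunks of at most 1000 examples
    interface.chunk_file(bin_file, chunks_dir)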
class News_Generator:
    '''
    Generate the news
    '''

    def __init__(self):
        '''
        filter: clean the text and use some magic regex
        remove_tag: skip these tags when generating news
        data_path: path of the data directory
        analyzier: analyze the crawled article
        template: handle the taiwan journalist template
        '''
        self.filter = ArticleFilter()
        self.remove_tag = ['公告', '協尋', '新聞']
        self.data_path = os.path.join(os.getenv("DATA"), "raw")
        self.analyzier = Analyzier()
        self.template = Template()

    def find_and_generate(self, board='Gossiping', thr=10, index=-1):
        '''
        Find the crawled ptt page and generate news
        Args:
            board: string, specify the ptt board
            thr: integer, the threshold of 'push'
            index: integer, specify the page
        Return:
            used_article: list, articles used to generate news
            article_urls: list, the urls linked to the original articles
            urls: list, the urls which appear in the articles
            summarys, responses: list, the summaries and key responses of the articles
            titles: list, the titles of the news
            paragraphs: list, the paragraphs of the news
        '''
        print('Generate news from', board)
        used_article, article_urls, urls, titles, paragraphs = [], [], [], [], []
        summarys, responses = [], []
        articles = self.get_articles(board, index)
        for article in articles:
            try:
                push_num = article['Response_Count']['push']
            except:
                push_num = 0
            if push_num > thr:
                article_url, url, summary, response, title, paragraph = self.generate_news(article)
                if title is not None and paragraph is not None:
                    used_article.append(article)
                    titles.append(title)
                    paragraphs.append(paragraph)
                    urls.append(url)
                    article_urls.append(article_url)
                    summarys.append(summary)
                    responses.append(response)
        return used_article, article_urls, urls, summarys, responses, titles, paragraphs

    def time_mapper(self, time):
        '''
        Map the time into modern Mandarin
        Args:
            time: string
        Return:
            the Mandarin form of time
        '''
        splited_time = list(map(int, time.split(':')))
        return '{}點{}分{}秒'.format(splited_time[0], splited_time[1], splited_time[2])

    def date_mapper(self, date):
        '''
        Map the date into modern Mandarin
        Args:
            date: string
        Return:
            the Mandarin form of date
        '''
        splited_date = date.split()
        month = {'Jan': '1', 'Feb': '2', 'Mar': '3', 'Apr': '4', 'May': '5', 'Jun': '6',
                 'Jul': '7', 'Aug': '8', 'Sep': '9', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
        return '{}年{}月{}日'.format(splited_date[3], month[splited_date[1]], splited_date[2])

    def generate_news(self, article):
        '''
        Generate the news from an article
        Args:
            article: dict, the crawled article
        Return:
            article_url: string, the url linked to the article
            url: dict, the urls which appear in the article and its responses
            all_summary, all_response: list, the summary and key responses of the article
            title: string, the title of the news
            paragraph: string, the paragraphs of the news
        '''
        # Filter out some special articles
        if article['Title'].startswith('Fw') or self.analyzier.check_article(article['Content']):
            return None, None, None, None, None, None

        # Split the tag and title
        tag, title = self.filter.get_tag(article['Title'])
        if tag in self.remove_tag:
            print('Tag {} is ignored!'.format(tag))
            return None, None, None, None, None, None
        if article['Title'].startswith('Re'):
            tag = '回覆'

        # Get the template
        max_summary = self.analyzier.get_content_len(article['Content'])
        max_response = self.analyzier.get_response_num(article['Responses'])
        print('max summary:{}, max response:{}'.format(max_summary, max_response))
        template = self.template.get_template(tag, max_summary, max_response)
        if template is None:
            print('No template!')
            return None, None, None, None, None, None

        # Clean the author id
        author = article['Author']
        author = re.sub(r'\(.*?\)', '', author).strip()

        # Deal with urls
        board = article['Board']
        article_url = 'https://www.ptt.cc/bbs/' + article['Board'] + '/' + article['Article_id'] + '.html'
        url = {'article': self.analyzier.get_url(article['Content']),
               'response': self.analyzier.get_response_url(article['Responses'])}
        print('url', url)

        # Extract summary and responses
        summarys = self.analyzier.find_summary(article['Content'], template['summary_num'])
        responses = self.analyzier.find_useful_response(article['Responses'], template['comment_num'])
        print(responses)

        # Deal with the article date
        all_date = article['Date'].split()
        if len(all_date) < 5:
            # When the crawler failed, give a special value
            time = '11:26:26'
            date = 'Thu Jul 20 2017'
        else:
            time = all_date.pop(3)
            date = ' '.join(all_date)
        time, date = self.time_mapper(time), self.date_mapper(date)

        # Fill the template
        title, paragraph = self.template.fill_template(template, date, time, title, author,
                                                       board, summarys, responses)

        # Maybe we want the pure summary and key responses
        clean_content = self.analyzier.filter.clean_content(content=article['Content'])
        key_summary = self.analyzier.extract_key_sentences(clean_content, sort_by_index=True, num=20)
        all_summary = [s[2] for s in key_summary]
        all_response = self.analyzier.find_useful_response(article['Responses'], 20)

        return article_url, url, all_summary, all_response, title, paragraph

    def get_articles(self, board, index=1):
        '''
        Get the crawled page by the modified time
        Args:
            board: string, specify the board
            index: integer, get the index-th page from the directory
        Return:
            articles
        '''
        def get_pagenum(filename):
            return int(re.findall(r'\d+', filename)[0])

        def get_modified(filename):
            return os.path.getctime(filename)

        path = os.path.join(self.data_path, board)
        filenames = [os.path.join(path, name) for name in os.listdir(path)
                     if not name.startswith(".")]
        filenames = sorted(filenames, key=get_modified)
        file = filenames[index]
        print('check the existence of', file)
        if os.path.exists(file):
            with open(file, 'r', encoding='utf-8') as f:
                articles = json.load(f)
            return articles
        else:
            print('No such file!')
            return []
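# A minimal usage sketch, not part of the original source: it assumes the DATA
# environment variable points at the crawler output (with a raw/<board> folder
# of crawled json pages) and that Template / ArticleFilter are importable.
if __name__ == '__main__':
    generator = News_Generator()
    results = generator.find_and_generate(board='Gossiping', thr=10, index=-1)
    used_article, article_urls, urls, summarys, responses, titles, paragraphs = results
    for title, paragraph in zip(titles, paragraphs):
        print(title)
        print(paragraph)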
    title_cut = jieba.cut(article['Title'], cut_all=False)
    title_cut = ' '.join(title_cut).strip()
    content = re.sub('\n', ' ', article['Content'])
    content = re.sub(r'\ +', ' ', content)
    content_cut = jieba.cut(content, cut_all=False)
    content_cut = ' '.join(content_cut).strip()
    article = {'id': title_cut, 'text': content_cut, 'raw': article['Raw']}
    return article


if __name__ == '__main__':
    data_path = os.getenv('DBDATA')
    ptt_filter = ArticleFilter()
    analyzier = Analyzier()
    articles = []
    filenames = [
        os.path.join(data_path, name)
        for name in os.listdir(data_path)
        if not name.startswith(".")
    ]
    print(filenames)
    for file in filenames:
        with open(file, 'r', encoding='utf-8') as data:
            articles += json.load(data)
    print(len(articles))
    articles = ptt_filter.generate_corpus(articles)
    response_counter = {}