Example 1
 def __init__(self):
     '''
     tr4s: extract key sentences
     tr4w: extract keywords
     filter: clean the text and use some magic regex
     '''
     self.tr4s = TextRank4Sentence(stop_words_file=stopword_path)
     self.tr4w = TextRank4Keyword(stop_words_file=stopword_path)
     self.filter = ArticleFilter()
Example 2
 def __init__(self):
     self.filter = ArticleFilter()
     self.SENTENCE_START = '<s>'
     self.SENTENCE_END = '</s>'
     dm_single_close_quote = u'\u2019'  # unicode
     dm_double_close_quote = u'\u201d'
     self.END_TOKENS = [
         '.', '!', '?', '...', "'", "`", '"', dm_single_close_quote,
         dm_double_close_quote, ")"
     ]
Example 3
 def __init__(self):
     '''
     filter: clean the text and use some magic regex
     remove_tag: skip these tags when generating news
     data_path: path of data directory
     analyzier: analyze the crawled article
     template: handle the taiwan journalist template
     '''
     self.filter = ArticleFilter()
     self.remove_tag = ['公告', '協尋', '新聞']
     self.data_path = os.path.join(os.getenv("DATA"), "raw")
     self.analyzier = Analyzier()
     self.template = Template()
Example 4
import os
import sys
import jieba
import schedule
import time
import subprocess
import re
# NOTE: the project-local classes used below (PttWebCrawler, Analyzier, ArticleFilter,
# Interface, SearchEngine) are presumably imported elsewhere in the original module;
# those imports are not shown in this excerpt.
#from db.db import DB

dict_path = os.path.join(os.getenv("JIEBA_DATA"), "dict.txt.big") 
jieba.set_dictionary(dict_path)
#db = DB(os.getenv('DATA'))
crawler = PttWebCrawler()
analyzier = Analyzier()
Filter = ArticleFilter()
interface = Interface()
engine = SearchEngine(os.getenv('DOC'), os.getenv('TFIDF_DATA'))


def get_url(texts):
    ''' Return the first image url found in the given texts, or None '''
    for text in texts:
        urls = analyzier.get_url(text)
        for url in urls:
            image_url = analyzier.open_url(url)
            if image_url is not None:
                return image_url


def generate_post(board, article, summary, response, title, paragraph, article_url, url, image_url):
Example 5
class Analyzier:
    ''' Analyze the ptt article '''
    def __init__(self):
        '''
        tr4s: extract key sentences
        tr4w: extract keywords
        filter: clean the text and use some magic regex
        '''
        self.tr4s = TextRank4Sentence(stop_words_file=stopword_path)
        self.tr4w = TextRank4Keyword(stop_words_file=stopword_path)
        self.filter = ArticleFilter()

    def get_url(self, content):
        ''' 
        get the url from content
        Args: 
            content: string
        Return: 
            list of string
        '''
        return self.filter.get_url(content)

    def get_content_len(self, content):
        '''
        get the number of lines in the cleaned content
        Args: 
            content: string
        Return: 
            integer
        '''
        clean_content = self.filter.clean_content(content=content)
        return len(clean_content.split('\n'))

    def get_response_num(self, responses):
        '''
        get the number of useful responses
        Args: 
            responses: list of dict
        Return: 
            integer
        '''
        clean_responses = self.filter.clean_responses(responses,
                                                      self.filter.stopwords)
        print('Response: {}, Clean response: {}'.format(
            len(responses), len(clean_responses)))
        return len(clean_responses)

    def get_response_url(self, responses):
        '''
        get all url from the responses
        Args: 
            responses: list of dict
        Return: 
            list of string
        '''
        urls = []
        for response in responses:
            content = re.sub(' +', '//', response['Content'])
            urls += self.filter.get_url(response['Content'])
            urls += self.filter.get_url(content)
        return list(set(urls))

    def check_article(self, content):
        '''
        check whether the article is 'Give P Coin' article
        Args: 
            content: string
        Return: 
            bool
        '''
        pattern = '[0-9]+?P'
        reward = re.findall(pattern, content)
        return len(reward) > 0

    def open_url(self, url):
        '''
        Try to get the image url from the input url
        If the input url is not an image and the Open Graph protocol
        gives nothing, None is returned
        Args: 
            url: string
        Return: 
            string or None
        '''
        def get_type(url):
            print(url)
            try:
                with urllib.request.urlopen(url) as response:
                    mime_type = response.info().get_content_type()
                print(mime_type)
                return mime_type.split('/')
            except Exception:
                return [None, None]

        if get_type(url)[0] == 'image':
            return url
        else:
            print('try og')
            try:
                page = metadata_parser.MetadataParser(url=url)
                image_link = page.get_metadata_link('image')
                if image_link is not None:
                    return image_link
            except Exception:
                return None

    def find_summary(self, content, summary_num=5, debug=True):
        '''
        generate the summary from input content
        Args: 
            content: string
            summary_num: integer, how many summary you want
            debug: bool, whether to print the result
        Return:
            list of string
        '''
        clean_content = self.filter.clean_content(content=content)

        # extract roughly one fifth of the lines, but never fewer than summary_num * 2 sentences
        max_num = int(len(clean_content.split('\n')) / 5)
        num = max_num if max_num > summary_num * 2 else summary_num * 2
        if len(clean_content) < 150:  # the article is short enough: keep every sentence
            num = 10**6
        if num > 20:  # the article is too long: cap the number of key sentences
            num = 20

        key_sentences = self.extract_key_sentences(clean_content,
                                                   sort_by_index=True,
                                                   num=num)
        key_sentences = [x[2] for x in key_sentences]
        if debug:
            print('Original content:', content)
            print('Cleaned content:', clean_content)
            print('Length of cleaned content:', len(clean_content))
            print('Key sentences:', key_sentences)
            print('Num of key sentences', len(key_sentences))
        '''
        Divide the key sentences into summary_num parts
        '''
        summarys = []
        summary_len = [1] * summary_num  # each part gets at least one sentence
        rest = len(key_sentences) - summary_num  # num of remain sentences
        if rest > summary_num:
            # equally distribute the key sentences to all part
            factor = int(rest / summary_num)
            rest = rest - (factor * summary_num)
            summary_len = [x + factor for x in summary_len]
        rest_count = 0
        for i in range(len(summary_len)):
            if rest > 0:
                # assign the remain sentences if we have
                summary_len[i] += 1
                rest -= 1
            summarys.append(','.join(key_sentences[rest_count:rest_count +
                                                   summary_len[i]]))
            rest_count += summary_len[i]

        return summarys

    def find_useful_response(self, responses, num=5):
        '''
        pick the num most important responses with TextRank
        Args:
            responses: list of dict
            num: integer, how many responses we want
        Return:
            list of dict (author, content, vote, ipdatetime)
        '''
        clean_responses = self.filter.clean_responses(responses,
                                                      self.filter.stopwords)
        if len(clean_responses) != 0:
            responses = clean_responses
        '''
        merge all responses into one article,
        keeping a mapping back to the original response dicts
        '''
        response_dict = {}
        all_response = ''
        for response in responses:
            all_response += response['Content'].replace(' ', '') + '\n'
            response_dict[response['Content'].replace(' ',
                                                      '').strip()] = response

        # run text rank
        key_responses = self.extract_key_sentences(all_response,
                                                   sort_by_index=False,
                                                   num=num)
        important_responses = []

        # restore the responses
        for r in key_responses:
            if r[2].strip() in response_dict.keys():
                response = response_dict[r[2].strip()]
                author, content = response['User'], response['Content']
                ipdatetime, vote = response['Ipdatetime'], response['Vote']
            else:
                author, content = 'unk', r[2]
                ipdatetime, vote = 'unk', 'unk'
            content = re.sub(' +', ',', content)
            important_responses.append({
                'author': author,
                'content': content,
                'vote': vote,
                'ipdatetime': ipdatetime
            })
        return important_responses

    def extract_keywords(self, content):
        '''
        extract the keywords from content
        Args:
            content: string
        Return:
            key_words: list of string
        '''
        clean_content = self.filter.clean_content(content=content)
        self.tr4w.analyze(text=clean_content, lower=True, window=2)
        key_words = []
        for item in self.tr4w.get_keywords(20, word_min_len=1):
            #print(item.word, item.weight)
            key_words.append(item.word)
        return key_words

    def extract_key_sentences(self, content, sort_by_index=False, num=5):
        '''
        extract key sentences from content
        Args:
            content: string
            sort_by_index: bool, whether sort the output by line index
            num: integer, how many sentences we want
        Return:
            list of information of key_sentences
        '''
        self.tr4s.analyze(text=content, lower=True, source='all_filters')
        key_sentences = []
        for item in self.tr4s.get_key_sentences(num=num, sentence_min_len=1):
            key_sentences.append([item.index, item.weight, item.sentence])
            #print(item.index, item.weight, item.sentence)
        #print('=====')
        def index(x):
            return x[0]

        def weight(x):
            return x[1]

        return sorted(key_sentences, key=index) if sort_by_index else sorted(
            key_sentences, key=weight)
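
The class above can be exercised on a single crawled article. The sketch below is a hypothetical usage example, not code from the original repository: it assumes the same module-level setup as Example 4 (jieba dictionary, stopword_path, ArticleFilter) is already in place, and the sample article dict only mimics the crawler's 'Content'/'Responses' layout.

# Hypothetical usage sketch for Analyzier; the sample data below is made up.
analyzier = Analyzier()

article = {
    'Content': '昨天在台北車站看到有人發錢 ...',
    'Responses': [
        {'User': 'userA', 'Content': '真假 也太誇張', 'Ipdatetime': '07/20 11:26', 'Vote': '推'},
        {'User': 'userB', 'Content': 'https://example.com/image.jpg', 'Ipdatetime': '07/20 11:27', 'Vote': '→'},
    ],
}

keywords = analyzier.extract_keywords(article['Content'])                # top TextRank keywords
summarys = analyzier.find_summary(article['Content'], summary_num=3, debug=False)
responses = analyzier.find_useful_response(article['Responses'], num=2)
urls = analyzier.get_response_url(article['Responses'])                  # urls mentioned in the comments

print(keywords)
print(summarys)
print(responses)
print(urls)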
Example 6
class Interface:
    def __init__(self):
        self.filter = ArticleFilter()
        self.SENTENCE_START = '<s>'
        self.SENTENCE_END = '</s>'
        dm_single_close_quote = u'\u2019'  # unicode
        dm_double_close_quote = u'\u201d'
        self.END_TOKENS = [
            '.', '!', '?', '...', "'", "`", '"', dm_single_close_quote,
            dm_double_close_quote, ")"
        ]

    def prepare_news(self, content, path):
        content = self.filter.clean_content(content)
        content_cutted = ' '.join(jieba.cut(content.strip(), cut_all=False))
        content_splitted = re.sub(' +', '\n', content_cutted).strip()
        content_splitted = re.sub('\n+', '\n', content_splitted)
        with open(path, 'w') as f:
            f.write(content_splitted + '\n')

    def read_text_file(self, text_file):
        lines = []
        with open(text_file, "r") as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                lines.append(line.strip())
        return lines

    def fix_missing_period(self, line):
        """Adds a period to a line that is missing a period"""
        if "@highlight" in line: return line
        if line == "": return line
        if line[-1] in self.END_TOKENS: return line
        # print line[-1]
        return line + " ."

    def chunk_file(self, in_file, chunks_dir):
        ''' Split a binary file of length-prefixed records into chunks of 1000 examples '''
        chunk = 0
        finished = False
        with open(in_file, "rb") as reader:
            while not finished:
                chunk_fname = os.path.join(chunks_dir,
                                           '%s_%03d.bin' % ('test', chunk))  # new chunk
                with open(chunk_fname, 'wb') as writer:
                    for _ in range(1000):
                        len_bytes = reader.read(8)
                        if not len_bytes:
                            finished = True
                            break
                        str_len = struct.unpack('q', len_bytes)[0]
                        example_str = struct.unpack('%ds' % str_len,
                                                    reader.read(str_len))[0]
                        writer.write(struct.pack('q', str_len))
                        writer.write(struct.pack('%ds' % str_len, example_str))
                chunk += 1

    def get_art_abs(self, story_file):
        lines = self.read_text_file(story_file)

        # Lowercase everything
        lines = [line.lower() for line in lines]

        # Put periods on the ends of lines that are missing them (this is a problem in the dataset because many image captions don't end in periods; consequently they end up in the body of the article as run-on sentences)
        lines = [self.fix_missing_period(line) for line in lines]

        # Separate out article and abstract sentences
        article_lines = []
        highlights = []
        next_is_highlight = False
        for idx, line in enumerate(lines):
            if line == "":
                continue  # empty line
            elif line.startswith("@highlight"):
                next_is_highlight = True
            elif next_is_highlight:
                highlights.append(line)
            else:
                article_lines.append(line)

        # Make article into a single string
        article = ' '.join(article_lines)

        # Make abstract into a single string, putting <s> and </s> tags around the sentences
        abstract = ' '.join([
            "%s %s %s" % (self.SENTENCE_START, sent, self.SENTENCE_END)
            for sent in highlights
        ])
        if len(highlights) == 0:
            abstract = 'test'
        #print(abstract)
        return article, abstract

    def write_to_bin(self, news_dir, out_file):
        story_fnames = os.listdir(news_dir)
        num_stories = len(story_fnames)

        with open(out_file, 'wb') as writer:
            for idx, s in enumerate(story_fnames):
                #print(s)
                # Look in the tokenized story dirs to find the .story file corresponding to this url
                story_file = os.path.join(news_dir, s)
                if not os.path.isfile(story_file):
                    continue
                article, abstract = self.get_art_abs(story_file)

                # Write to tf.Example
                if len(article) == 0:  # guard against an empty article
                    print('error!')
                tf_example = example_pb2.Example()
                tf_example.features.feature['article'].bytes_list.value.extend(
                    [bytes(article, 'utf-8')])
                tf_example.features.feature[
                    'abstract'].bytes_list.value.extend(
                        [bytes(abstract, 'utf-8')])
                tf_example_str = tf_example.SerializeToString()
                str_len = len(tf_example_str)
                writer.write(struct.pack('q', str_len))
                writer.write(struct.pack('%ds' % str_len, tf_example_str))

        print("Finished writing file %s\n" % out_file)
        return story_fnames
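
The three file helpers above form a small pipeline: prepare_news writes one jieba-segmented story file per article, write_to_bin packs every story into length-prefixed tf.Example records (an 8-byte struct-packed length followed by the serialized proto), and chunk_file splits that binary file into chunks of at most 1000 examples. The sketch below is a hypothetical driver; the directory and file names are assumptions, not part of the class.

# Hypothetical end-to-end sketch for Interface (paths are made up).
interface = Interface()

news_dir = '/tmp/news'      # one segmented story file per article
chunks_dir = '/tmp/chunks'  # chunk_file() writes test_000.bin, test_001.bin, ...
os.makedirs(news_dir, exist_ok=True)
os.makedirs(chunks_dir, exist_ok=True)

# 1. Clean and segment one article into a story file (jieba-cut, one token per line).
interface.prepare_news(content='昨天在台北車站看到有人發錢', path=os.path.join(news_dir, '0.story'))

# 2. Pack every story file under news_dir into a single binary file of tf.Example records.
interface.write_to_bin(news_dir, out_file='/tmp/test.bin')

# 3. Split the binary file into chunks of at most 1000 examples each.
interface.chunk_file(in_file='/tmp/test.bin', chunks_dir=chunks_dir)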
Example 7
class News_Generator:
    ''' Generate the news '''
    def __init__(self):
        '''
        filter: clean the text and use some magic regex
        remove_tag: skip these tags when generating news
        data_path: path of data directory
        analyzier: analyze the crawled article
        template: handle the taiwan journalist template
        '''
        self.filter = ArticleFilter()
        self.remove_tag = ['公告', '協尋', '新聞']
        self.data_path = os.path.join(os.getenv("DATA"), "raw")
        self.analyzier = Analyzier()
        self.template = Template()

    def find_and_generate(self, board='Gossiping', thr=10, index=-1):
        '''
        Find the crawled ptt page and generate news
        Args:
            board: string, specify the ptt board
            thr: integer, the threshold of 'push'
            index: integer, specify the page
        Return:
            used_article: list, articles used to generate news
            article_urls: list, the urls linked to original article
            urls: list, the urls which appears in the article
            summary, response: list, the summary and key responses of articles
            titles: list, the title of news
            paragraphs: list, the paragraphs of news
        '''
        print('Generate news from', board)
        used_article, article_urls, urls, titles, paragraphs = [], [], [], [], []
        summarys, responses = [], []

        articles = self.get_articles(board, index)
        for article in articles:
            try:
                push_num = article['Response_Count']['push']
            except (KeyError, TypeError):
                push_num = 0
            if push_num > thr:
                article_url, url, summary, response, title, paragraph = self.generate_news(article)
                if title is not None and paragraph is not None:
                    used_article.append(article)
                    titles.append(title)
                    paragraphs.append(paragraph)
                    urls.append(url)
                    article_urls.append(article_url)
                    summarys.append(summary)
                    responses.append(response)
        return used_article, article_urls, urls, summarys, responses, titles, paragraphs

    def time_mapper(self, time):
        '''
        Map the time into modern Mandarin
        Args:
            time: string
        Return:
            the Mandarin form of time
        '''
        splited_time = list(map(int, time.split(':')))
        return '{}點{}分{}秒'.format(splited_time[0], splited_time[1], splited_time[2])

    def date_mapper(self, date):
        '''
        Map the date into modern Mandarin
        Args:
            date: string
        Return:
            the Mandarin form of date
        '''
        splited_date = date.split()
        month = {'Jan':'1', 'Feb':'2','Mar':'3',
                'Apr':'4', 'May':'5', 'Jun':'6',
                'Jul':'7', 'Aug':'8', 'Sep':'9',
                'Oct':'10', 'Nov':'11', 'Dec':'12'}
        return '{}年{}月{}日'.format(splited_date[3],
                                    month[splited_date[1]],
                                    splited_date[2])


    def generate_news(self, article):
        '''
        Generate the news from article
        Args:
            article: dict, the crawled article
        Return:
            article_url: string, the url linked to article
            url: string, the urls which appears in the article
            all_summary, all_response: list, the summary and key responses of the article
            title: string, the title of news
            paragraph: string, the paragraphs of news
        '''
        # Filter out some special article
        if article['Title'].startswith('Fw') or self.analyzier.check_article(article['Content']):
            return None, None, None, None, None, None

        # Split the tag and title
        tag, title = self.filter.get_tag(article['Title'])
        if tag in self.remove_tag:
            print('Tag {} is ignored!'.format(tag))
            return None, None, None, None, None, None
        if article['Title'].startswith('Re'):
            tag = '回覆'

        # Get the template
        max_summary = self.analyzier.get_content_len(article['Content'])
        max_response = self.analyzier.get_response_num(article['Responses'])
        print('max summary:{}, max response:{}'.format(max_summary, max_response))
        template = self.template.get_template(tag, max_summary, max_response)
        if template is None:
            print('No template!')
            return None, None, None, None, None, None
        
        # Clean author id
        author = article['Author']
        author = re.sub(r'\(.*?\)', '', author).strip()

        # Deal with urls
        board = article['Board']
        article_url = 'https://www.ptt.cc/bbs/' + article['Board'] + '/' + article['Article_id'] + '.html'
        url = {'article': self.analyzier.get_url(article['Content']), 'response':self.analyzier.get_response_url(article['Responses'])}
        print('url', url)

        # Extract summary and response
        summarys = self.analyzier.find_summary(article['Content'], template['summary_num'])
        responses = self.analyzier.find_useful_response(article['Responses'], template['comment_num'])
        print(responses)

        # Deal with the article date
        all_date = article['Date'].split()
        if len(all_date) < 5:
            # When the crawler failed, give special value
            time = '11:26:26'
            date = 'Thu Jul 20 2017'
        else:
            time = all_date.pop(3)
            date = ' '.join(all_date)
        time, date = self.time_mapper(time), self.date_mapper(date)
        
        # Fill the template
        title, paragraph = self.template.fill_template(template, date, time, title, author, board, summarys, responses)

        # Maybe we want the pure summary and key responses
        clean_content = self.analyzier.filter.clean_content(content=article['Content'])
        key_summary = self.analyzier.extract_key_sentences(clean_content, sort_by_index=True, num=20)
        all_summary = [s[2] for s in key_summary]
        all_response = self.analyzier.find_useful_response(article['Responses'], 20)

        return article_url, url, all_summary, all_response, title, paragraph
    
    def get_articles(self, board, index=1):
        '''
        Get the crawled page by the modified time
        Args:
            board: string, specify the board
            index: get index(st) page from the directory
        Return:
            articles
        '''
        def get_pagenum(filename):
            return int(re.findall(r'\d+', filename)[0])
        def get_modified(filename):
            return os.path.getctime(filename)
        path = os.path.join(self.data_path, board)
        filenames = [os.path.join(path, name) for name in os.listdir(path) if not name.startswith(".")]
        filenames = sorted(filenames, key=get_modified)
        file = filenames[index]
        print('check the existence of ', file)
        if os.path.exists(file):
            with open(file, 'r', encoding='utf-8') as f:
                articles = json.load(f)
            return articles
        else:
            print('No such file!')
            return []
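
A minimal driver for the generator might look like the sketch below. It is a hypothetical example, not code from the repository, and it assumes $DATA/raw/Gossiping already contains crawled JSON pages in the layout expected by get_articles.

# Hypothetical driver sketch for News_Generator.
generator = News_Generator()

(used_article, article_urls, urls,
 summarys, responses, titles, paragraphs) = generator.find_and_generate(board='Gossiping', thr=10, index=-1)

for title, paragraph, article_url in zip(titles, paragraphs, article_urls):
    print(title)
    print(paragraph)
    print('source:', article_url)
    print('-' * 40)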
Example 8
    title_cut = jieba.cut(article['Title'], cut_all=False)
    title_cut = ' '.join(title_cut).strip()

    content = re.sub('\n', ' ', article['Content'])
    content = re.sub(' +', ' ', content)
    content_cut = jieba.cut(content, cut_all=False)
    content_cut = ' '.join(content_cut).strip()

    article = {'id': title_cut, 'text': content_cut, 'raw': article['Raw']}
    return article


if __name__ == '__main__':
    data_path = os.getenv('DBDATA')

    ptt_filter = ArticleFilter()
    analyzier = Analyzier()

    articles = []
    filenames = [
        os.path.join(data_path, name) for name in os.listdir(data_path)
        if not name.startswith(".")
    ]
    print(filenames)
    for file in filenames:
        with open(file, 'r', encoding='utf-8') as data:
            articles += json.load(data)
    print(len(articles))
    articles = ptt_filter.generate_corpus(articles)

    response_counter = {}