Code Example #1
File: main.py  Project: cilsat/perisalah-corpus
# Imports this snippet relies on: newspaper's Article, Config, and nlp module
from newspaper import Article, Config, nlp


def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    cleaner = Article(url='', config=config)

    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')

    raw = []

    for htm in htmfile:
        print(htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()

            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            # raw.append(sentences)

            with open('htm-out', 'a') as f:
                for r in sentences:
                    f.write(r + '\n')
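
A minimal entry point this script presumably pairs with (not shown in the excerpt); the argv[1] indexing above suggests sys.argv is passed through unchanged:

import sys

if __name__ == '__main__':
    main(sys.argv)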
Code Example #2
File: metro.py  Project: cilsat/koran-crawler
    def parse_article(self, response):
        # utilize newspaper for article parsing
        article = Article(url=response.url, config=self.config)
        article.set_html(response.body)

        article.parse()
        item = Art()
        item['title'] = article.title
        item['url'] = article.url
        item['text'] = '\n'.join(nlp.split_sentences(article.text.replace('\n', ' ')))
        yield item
Code Example #3
File: republika.py  Project: cilsat/koran-crawler
    def parse_article(self, response):
        # utilize newspaper for article parsing
        article = Article(url=response.url, config=self.config)
        article.set_html(response.body)

        article.parse()
        item = Art()
        item["title"] = article.title
        item["url"] = article.url
        item["text"] = "\n".join(nlp.split_sentences(article.text.replace("\n", " ")))
        yield item
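
The parse_article callbacks in Code Example #2 and #3 (and the similar one further below) assume an Art item with title, url, and text fields plus a newspaper Config stored on the spider. A minimal sketch of that scaffolding, reconstructed as an assumption rather than taken from the koran-crawler project (the spider name here is made up):

import scrapy
from newspaper import Config

class Art(scrapy.Item):
    # fields filled in by the parse_article callbacks
    title = scrapy.Field()
    url = scrapy.Field()
    text = scrapy.Field()

class KoranSpider(scrapy.Spider):
    name = 'koran'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # newspaper configuration shared by the parsing callbacks
        self.config = Config()
        self.config.language = 'id'
        self.config.fetch_images = False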
Code Example #4
def _new_summarize(text='', max_sents=5):

    summaries = []
    sentences = split_sentences(text)
    keys = keywords(text)

    # Score sentences, and use the top 5 or max_sents sentences
    ranks = nlp.score(sentences, keys).most_common(max_sents)
    for rank in ranks:
        summaries.append(rank[0])
    summaries.sort(key=lambda summary: summary[0])
    return [summary[1] for summary in summaries]
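
_new_summarize appears to follow the same shape as newspaper's built-in nlp.summarize: score every sentence against the extracted keywords, keep the top max_sents, then sort the survivors back into document order (the score keys are (index, sentence) pairs, so summary[0] is the position and summary[1] the sentence). A hedged usage sketch, assuming split_sentences and keywords come from a newspaper nlp module whose score() accepts the two arguments used above, and with 'article.txt' standing in for any text file:

from newspaper import nlp
from newspaper.nlp import split_sentences, keywords

with open('article.txt') as f:
    text = f.read()

for sentence in _new_summarize(text, max_sents=3):
    print(sentence)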
Code Example #5
File: tempo.py  Project: cilsat/koran-crawler
    def parse_article(self, response):
        if len(response.body) > 0:
            # utilize newspaper for article parsing
            article = Article(url=response.url, config=self.config)
            article.set_html(response.body)
            article.parse()

            #self.sentences.append(nlp.split_sentences(article.text))
            
            item = Art()
            item['title'] = article.title
            item['url'] = article.url
            item['text'] = '\n'.join(nlp.split_sentences(article.text.replace('\n', ' ')))
            yield item
        else:
            print(response.url + ' DEAD LINK')
Code Example #6
    def summarize(self, html, percent_sentences):
        if (percent_sentences is None or percent_sentences > 100
                or percent_sentences < 0):
            percent_sentences = 15

        article = self.process_html(html)

        # remove title from the text, if it appears in the text
        if article.text.startswith(article.title):
            article.set_text(article.text[len(article.title):])

        sentences = nlp.split_sentences(article.text)
        log.debug(article.text)

        # remove punctuation, numbers, and special characters
        clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)
        clean_sentences = [s.lower() for s in clean_sentences]
        clean_sentences = [
            self._remove_stopwords(r.split()) for r in clean_sentences
        ]

        # create sentence vectors
        sentence_vectors = []
        for i in clean_sentences:
            if len(i) != 0:
                v = sum([
                    self.word_embeddings.get(w, np.zeros((300, )))
                    for w in i.split()
                ]) / (len(i.split()) + 0.001)
            else:
                v = np.zeros((300, ))
            sentence_vectors.append(v)

        # similarity matrix
        sim_mat = np.zeros([len(sentences), len(sentences)])

        # initialize matrix
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if i != j:
                    sim_mat[i][j] = cosine_similarity(
                        sentence_vectors[i].reshape(1, 300),
                        sentence_vectors[j].reshape(1, 300),
                    )[0, 0]

        # convert matrix into graph
        nx_graph = nx.from_numpy_array(sim_mat)
        textrank_scores = self.normalize_scores(nx.pagerank(nx_graph))

        # get newspaper's nlp scores
        # https://github.com/codelucas/newspaper/blob/master/newspaper/article.py#L372
        nlp.load_stopwords(article.config.get_language())

        # call to: nlp.summarize(title=article.title, text=article.text, max_sents=max_sents)
        # https://github.com/codelucas/newspaper/blob/master/newspaper/nlp.py#L40
        title_words = nlp.split_words(article.title)
        most_frequent = nlp.keywords(article.text)

        nlp_scores = self.normalize_scores(
            nlp.score(sentences, title_words, most_frequent))

        totalled_scores = Counter()
        for key, value in nlp_scores.items():
            totalled_scores[key[0]] += value

        for key, value in textrank_scores.items():
            totalled_scores[key] += value

        num_sentences = int(len(clean_sentences) * percent_sentences / 100)
        sentence_indices = list(
            map(lambda x: x[0], totalled_scores.most_common(num_sentences)))

        return list(map(lambda x: sentences[x], sentence_indices))
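
The summarize method above leans on several helpers and attributes that are not shown: process_html, _remove_stopwords, normalize_scores, self.word_embeddings, and module-level imports of numpy, pandas, networkx, sklearn's cosine_similarity, collections.Counter, and newspaper's nlp. A rough sketch of what those pieces might look like, written as an assumption consistent with how they are called above (the self.stopwords attribute name is a guess):

from collections import Counter

import networkx as nx
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from newspaper import Article, nlp

    # additional methods assumed to live on the same class as summarize
    def process_html(self, html):
        # parse raw HTML into a newspaper Article, as in Code Example #1
        article = Article(url='', config=self.config)
        article.set_html(html)
        article.parse()
        return article

    def _remove_stopwords(self, words):
        # drop stopwords and re-join into a string, since the caller
        # later applies .split() to each cleaned sentence
        return ' '.join(w for w in words if w not in self.stopwords)

    def normalize_scores(self, scores):
        # min-max normalize a dict/Counter of scores so the TextRank and
        # newspaper scores are on a comparable scale before being summed
        values = list(scores.values())
        low, high = min(values), max(values)
        spread = (high - low) or 1.0
        return {k: (v - low) / spread for k, v in scores.items()}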