def test_add_node():
    graph = Graph()
    graph.add_node(Node('a'))
    assert Node('a') in graph._adjacency_list.keys()
    assert len(graph._adjacency_list.keys()) == 1
    assert len(graph._adjacency_list[Node('a')]) == 0

def test_get_nodes():
    graph = Graph()
    graph.add_node(Node('a'))
    graph.add_node(Node('b'))
    nodes = graph.get_nodes()
    assert len(nodes) == 2
    assert Node('a') in nodes and Node('b') in nodes

def test_add_duplicate_edge():
    graph = Graph()
    graph.add_edge(Node('a'), Node('b'))
    graph.add_edge(Node('a'), Node('b'))
    assert Node('a') in graph._adjacency_list.keys()
    assert Node('b') in graph._adjacency_list.keys()
    assert len(graph._adjacency_list.keys()) == 2
    assert len(graph._adjacency_list[Node('a')]) == 1
    assert len(graph._adjacency_list[Node('b')]) == 0

def test_get_connected_from():
    graph = Graph()
    graph.add_edge(Node('a'), Node('b'))
    graph.add_edge(Node('a'), Node('c'))
    assert graph.get_nodes_count() == 3
    assert graph.get_edges_count() == 2
    connected_from = graph.get_connected_from(Node('a'))
    assert len(connected_from) == 2
    assert Node('b') in connected_from
    assert Node('c') in connected_from
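# NOTE: The tests above exercise a Graph built on a Node-keyed adjacency list.
# The class itself is not reproduced in this section (only has_node appears
# further below), so the following is a minimal sketch, assuming set-based
# adjacency values and implicit creation of edge endpoints; anything beyond
# the behaviour the tests assert is an assumption, not the project's code.
class Graph:

    def __init__(self):
        # Maps each Node to the set of nodes it points to.
        self._adjacency_list = {}

    def add_node(self, node):
        self._adjacency_list.setdefault(node, set())

    def add_edge(self, from_node, to_node):
        # Adding an edge implicitly adds both endpoints; a duplicate edge
        # collapses because the adjacency value is a set.
        self.add_node(from_node)
        self.add_node(to_node)
        self._adjacency_list[from_node].add(to_node)

    def get_nodes(self):
        return list(self._adjacency_list.keys())

    def get_nodes_count(self):
        return len(self._adjacency_list)

    def get_edges_count(self):
        return sum(len(targets) for targets in self._adjacency_list.values())

    def get_connected_from(self, node):
        return list(self._adjacency_list.get(node, set()))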
def test_rank():
    sentence_list = sorted([
        'This is a sentence.',
        'A quick brown fox.',
        'Jumped over the lazy dog.',
    ])
    sentence_nodes = sentences.rank(
        [Node(s, score=sentences.DEFAULT_NODE_SCORE) for s in sentence_list])
    sentence_nodes = sorted(sentence_nodes, key=lambda n: n.data)
    # Here we expect that each sentence node will have the same score. This is
    # because the 'clean' (non-stop) words in each sentence have no similarity
    # with any other sentence.
    for i in range(len(sentence_list)):
        assert sentence_nodes[i].data == sentence_list[i]
        assert sentence_nodes[i].score == sentences.DEFAULT_NODE_SCORE
def test_get_nodes_count():
    graph = Graph()
    graph.add_node(Node('a'))
    assert graph.get_nodes_count() == 1

def test_get_averaged_score_data():
    node = Node('a', score=10)
    assert len(node.variations) == 0
    assert node.get_averaged_score() == 10.0
def test_get_averaged_score_variations():
    node = Node('a', score=10)
    node.variations = ['v1', 'v2']
    assert node.get_averaged_score() == 5.0
def test_equality():
    node_a = Node('a')
    node_b = Node('a')
    assert node_a == node_b

def test_remove_variation():
    node = Node('a')
    node.variations = ['v1', 'v2']
    node.remove_variation('v1')
    assert set(node.variations) == set(['v2'])

def test_add_variation():
    node = Node('a')
    node.add_variation('v')
    assert set(node.variations) == set(['v'])

def test_inequality():
    node_a = Node('a')
    node_b = Node('b')
    assert node_a != node_b
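# NOTE: The Node tests assume a value class that compares and hashes by its
# data, carries a score, an optional index, and a list of variations, and
# averages its score over those variations. This is a minimal sketch that is
# consistent with the assertions; the real class may differ in details such
# as the default score.
class Node:

    def __init__(self, data, score=0.0, index=None):
        self.data = data
        self.score = score
        self.index = index
        self.variations = []

    def __eq__(self, other):
        return isinstance(other, Node) and self.data == other.data

    def __hash__(self):
        return hash(self.data)

    def add_variation(self, variation):
        self.variations.append(variation)

    def remove_variation(self, variation):
        self.variations.remove(variation)

    def has_variation(self, variation):
        return variation in self.variations

    def get_variations(self):
        return list(self.variations)

    def get_averaged_score(self):
        # With no variations the raw score is returned; otherwise the score
        # is spread evenly across the variations (10 with two variations
        # gives 5.0, matching the tests above).
        if not self.variations:
            return float(self.score)
        return self.score / len(self.variations)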
def dl_techcrunch(session, stemmer, year, month, day):
    base_url = 'https://techcrunch.com/'
    archive_url = '{}{}/{}/{}'.format(base_url, year, month, day)

    # Download links for the articles.
    response = requests.get(archive_url)
    if not 200 <= response.status_code < 300:
        print('techcrunch failed to download archive page: status {}'.format(
            response.status_code))
        return False
    html = BeautifulSoup(response.text, 'html.parser')
    post_titles = html.select('.post-title')
    article_urls = []
    for post_title in post_titles:
        url = post_title.select('a')[0].attrs['href']
        if url.replace('www.', '').startswith(base_url):
            article_urls.append(url)

    # Download, summarise and store articles.
    for url in article_urls:
        # Check if we already have the article in the DB.
        if _get_article(session, normalize_url(url)):
            print('~ {}'.format(normalize_url(url)))
            continue
        tc_article = techcrunch.ArticleLoader.load(url)
        # Download the article content.
        try:
            article = _download_article(url)
            # Use for now until summaries and loaders are better.
            article.nlp()
        except newspaper.article.ArticleException:
            print('- {}'.format(normalize_url(url)))
            # Skip articles that failed to download; article would otherwise
            # be stale or undefined below.
            continue
        # Normalise and hash all the sentences to make finding the index more
        # accurate.
        text_sentences = [
            _hash_text(s) for s in tokenize_sentences(article.text)]
        # Conform data for entry into DB.
        summary_sentences = []
        keywords = [Node(w) for w in article.keywords]
        for tag in tc_article.get('tags', []):
            if tag not in article.keywords:
                keywords.append(Node(tag))
        for sentence in article.summary.split('\n'):
            index = text_sentences.index(_hash_text(sentence))
            summary_sentences.append(Node(sentence, index=index))
        # Insert article and summary into DB.
        dao.article.insert(
            session=session,
            text=tc_article.get('content', article.text),
            url=normalize_url(url),
            title=tc_article.get('title', article.title),
            keywords=keywords,
            sentences=summary_sentences,
            published_at=_format_timestamp(year, month, day),
            s_analysis=None,
        )
        print('+ {}'.format(url))
    return True
def dl_hackernoon(session, stemmer, year, month, day):
    base_url = 'https://hackernoon.com/'
    archive_url = '{}archive/{}/{}/{}'.format(base_url, year, month, day)

    # Download links for the articles.
    response = requests.get(archive_url)
    if not 200 <= response.status_code < 300:
        print('hackernoon failed to download archive page: status {}'.format(
            response.status_code))
        return False
    html = BeautifulSoup(response.text, 'html.parser')
    post_title_list = html.select('div.js-postStream')
    article_urls = []
    if post_title_list:
        for anchor in post_title_list[0].select('a'):
            url = anchor.attrs['href']
            if url.replace('www.', '').startswith(base_url):
                if 'source=collection_archive' in url:
                    url = url.split('?')[0]
                    match = re.match('[a-z0-9]+', url.split('-')[-1])
                    if match and url not in article_urls and '@' not in url:
                        article_urls.append(url)

    # Download, summarise and store articles.
    for url in article_urls:
        # Check if we already have the article in the DB.
        if _get_article(session, normalize_url(url)):
            print('~ {}'.format(normalize_url(url)))
            continue
        # Download the article content.
        try:
            article = _download_article(url)
            # Use for now until summaries and loaders are better.
            article.nlp()
        except newspaper.article.ArticleException:
            print('- {}'.format(normalize_url(url)))
            # Skip articles that failed to download; article would otherwise
            # be stale or undefined below.
            continue
        hn_article = hackernoon.ArticleLoader.load(url)
        # Normalise and hash all the sentences to make finding the index more
        # accurate.
        text_sentences = [
            _hash_text(s) for s in tokenize_sentences(article.text)]
        # Conform data for entry into DB.
        summary_sentences = []
        keywords = [Node(w) for w in article.keywords]
        for sentence in article.summary.split('\n'):
            index = text_sentences.index(_hash_text(sentence))
            summary_sentences.append(Node(sentence, index=index))
        # Insert article and summary into DB.
        dao.article.insert(
            session=session,
            text=hn_article.get('content', article.text),
            url=normalize_url(url),
            title=hn_article.get('title', article.title),
            keywords=keywords,
            sentences=summary_sentences,
            published_at=_format_timestamp(year, month, day),
            s_analysis=None,
        )
        print('+ {}'.format(url))
    return True
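# NOTE: Both downloaders rely on a _download_article helper that is not shown
# in this section. The sketch below is one plausible shape for it, assuming
# the standard newspaper3k workflow; the real version may add configuration,
# caching or language settings.
import newspaper  # likely already imported at module level


def _download_article(url):
    # Fetch and parse the page; callers invoke article.nlp() themselves when
    # they want keywords and a summary.
    article = newspaper.Article(url)
    article.download()
    article.parse()
    return article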
def has_node(self, data):
    '''Return true if there exists a node with the specified data.'''
    return Node(data) in self._adjacency_list
def test_get_edges_count():
    graph = Graph()
    graph.add_edge(Node('a'), Node('b'))
    assert graph.get_nodes_count() == 2
    assert graph.get_edges_count() == 1

def test_has_node():
    graph = Graph()
    graph.add_node(Node('a'))
    assert graph.has_node('a')
    assert not graph.has_node('b')

def test_has_variation():
    node = Node('a')
    node.variations = ['v']
    assert node.has_variation('v')
    assert not node.has_variation('w')

def test_get_variations():
    node = Node('a')
    node.variations = ['v']
    assert set(node.get_variations()) == set(['v'])
def _summarize(text='', title='', url='',
               sentence_count=DEFAULT_SENTENCE_COUNT,
               suggestedKeywords=None,
               keyword_count=DEFAULT_KEYWORD_COUNT):
    # s_analysis defaults to None and is filled in below when analysis succeeds.
    article_data = {'title': title, 'text': text, 'url': url,
                    's_analysis': None}
    if url:
        # Check if the article is cached.
        article = _get_summary(normalize_url(url))
        if article:
            if suggestedKeywords is not None:
                for word in suggestedKeywords:
                    newKeyword = Keyword(word, 1)
                    if newKeyword not in article.keywords:
                        article.keywords.insert(0, newKeyword)
                db.session.add(article)
                db.session.commit()
            if article.s_analysis is None:
                senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                    article.text)
                if senti_analysis_data is not None:
                    article.s_analysis = senti_analysis_data['label']
            return {
                'title': article.title,
                'text': article.text,
                'sentences':
                    [s.data for s in article.sentences][:sentence_count],
                'keywords':
                    [w.data for w in article.keywords][:keyword_count],
                's_analysis': article.s_analysis,
            }

        article = _get_article_from_url(url)
        article_data['text'] = article.text
        if 'techcrunch' in url:
            tc_article = techcrunch.ArticleLoader.load(url)
            article_data['title'] = tc_article['title']
            article_data['text'] = tc_article['content']
            article_data['published_at'] = tc_article['timestamp']
            senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                tc_article['content'])
            if senti_analysis_data is not None:
                article_data['s_analysis'] = senti_analysis_data['label']
        elif 'wired' in url:
            wired_article = wired.ArticleLoader.load(url)
            article_data['title'] = wired_article['title']
            article_data['text'] = wired_article['content']
            article_data['published_at'] = wired_article['date']
            senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                wired_article['content'])
            if senti_analysis_data is not None:
                article_data['s_analysis'] = senti_analysis_data['label']
        elif 'hackernoon' in url:
            hackernoon_article = hackernoon.ArticleLoader.load(url)
            article_data['title'] = hackernoon_article['title']
            article_data['text'] = hackernoon_article['content']
            article_data['published_at'] = hackernoon_article['date']
            senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                hackernoon_article['content'])
            if senti_analysis_data is not None:
                article_data['s_analysis'] = senti_analysis_data['label']
        elif 'venturebeat' in url:
            venturebeat_article = venturebeat.ArticleLoader.load(url)
            article_data['title'] = venturebeat_article['title']
            article_data['text'] = venturebeat_article['content']
            article_data['published_at'] = venturebeat_article['date']
            senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                venturebeat_article['content'])
            if senti_analysis_data is not None:
                article_data['s_analysis'] = senti_analysis_data['label']
        elif 'news.com.au' in url:
            newsau_article = newsau.ArticleLoader.load(url)
            article_data['title'] = newsau_article['title']
            article_data['text'] = newsau_article['content']
            article_data['published_at'] = newsau_article['date']
            senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                newsau_article['content'])
            if senti_analysis_data is not None:
                article_data['s_analysis'] = senti_analysis_data['label']
        elif 'cricket' in url:
            ca_article = cricketau.ArticleLoader.load(url)
            article_data['title'] = ca_article['title']
            article_data['text'] = ca_article['content']
            article_data['published_at'] = ca_article['date']
            senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                ca_article['content'])
            if senti_analysis_data is not None:
                article_data['s_analysis'] = senti_analysis_data['label']
        else:
            article_data['title'] = article.title
            senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                article.text)
            if senti_analysis_data is not None:
                article_data['s_analysis'] = senti_analysis_data['label']
    else:
        article_data['title'] = title
        senti_analysis_data = sentiment.SentimentAnalysis.analyise(text)
        if senti_analysis_data is not None:
            article_data['s_analysis'] = senti_analysis_data['label']

    sentence_nodes = []
    sentences = tokenize_sentences(article_data['text'])
    for i, data in enumerate(sentences):
        sentence_nodes.append(Node(data, index=i))
    ranked_sentences = sorted(rank_sentences(sentence_nodes),
                              key=lambda n: n.score, reverse=True)
    keywords = rank_words(article_data['title'], article_data['text'])
    if suggestedKeywords is not None:
        # Prepend any suggested keywords that the ranking did not already
        # produce.
        for word in suggestedKeywords:
            newKeyword = Keyword(word, 1)
            if newKeyword not in keywords:
                keywords.insert(0, newKeyword)
    if url:
        _insert_summary(title=article_data['title'],
                        text=article_data['text'],
                        url=normalize_url(url),
                        keywords=keywords,
                        sentences=ranked_sentences,
                        published_at=article_data.get('published_at'),
                        s_analysis=article_data['s_analysis'])
    return {
        'title': article_data['title'],
        'text': article_data['text'],
        'sentences': [node.data for node in ranked_sentences][:sentence_count],
        'keywords': [node.data for node in keywords][:keyword_count],
        's_analysis': article_data['s_analysis'],
    }
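# NOTE: Illustrative usage of _summarize only; the URL and counts below are
# made up, and the returned sentences/keywords are the ranked node data.
if __name__ == '__main__':
    summary = _summarize(url='https://techcrunch.com/some-article/',
                         sentence_count=5, keyword_count=10)
    print(summary['title'])
    for sentence in summary['sentences']:
        print('- {}'.format(sentence))
    print('keywords: {}'.format(', '.join(summary['keywords'])))
    print('sentiment: {}'.format(summary['s_analysis']))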