def create_article_obj(self, url, feed_id, feed_content=None):
    if feed_content is not None:
        parsed_text = self.parse_feed_text(feed_content)
        article_obj = Article(url, feed_id, article_text=parsed_text)
    else:
        article_obj = Article(url, feed_id)
    return article_obj
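# A minimal usage sketch for create_article_obj; FeedReader, the URL, and the
# feed_id value are illustrative stand-ins for whatever class actually hosts
# this method, not names taken from the source.
reader = FeedReader()
plain = reader.create_article_obj("https://example.com/post", feed_id=7)
parsed = reader.create_article_obj("https://example.com/post", feed_id=7,
                                   feed_content="<p>raw feed body</p>")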
def setup_test(self):
    ''' Make ourselves a way to quickly set up articles storage. '''
    self.article_0 = Article(0, "Some title 0", "2016-09-22", "Some body text 0",
                             ['tag0', 'tag1', 'tag2', 'tag3'])
    self.article_1 = Article(1, "Some title 1", "2016-09-22", "Some body text 1",
                             ['tag0', 'tag4', 'tag5', 'tag6'])
    self.article_2 = Article(2, "Some title 2", "2016-09-23", "Some body text 2",
                             ['tag0', 'tag1', 'tag2', 'tag3'])
    self.article_3 = Article(3, "Some title 3", "2016-09-23", "Some body text 3",
                             ['tag0', 'tag1', 'tag2', 'tag3'])
    self.article_4 = Article(4, "Some title 4", "2016-09-23", "Some body text 4",
                             ['tag0', 'tag1', 'tag2', 'tag3'])
    self.articles = Articles()
def get_article_full(alias):
    ''' Return full article contents. '''
    db = get_db()
    article = Article(alias)
    article.load_all_data(db)
    db.commit()
    return article
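# Usage sketch: fetch one article by its alias; "my-first-post" is an invented
# example alias, and a database connection is assumed available via get_db().
article = get_article_full("my-first-post")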
def article(id):
    if not id or id == "undefined":
        # missing or placeholder id: send the client back to the index
        return redirect(url_for("index"))
    try:
        article = Article(os.path.join(ARTICLES_PATH, id + ".json"))
    except FileNotFoundError:
        return redirect(url_for("index"))
    return render_template("article.html", article=article)
def find_translation(article, lang, cur):
    src_url = find_translation_url(article, lang)
    if not src_url:
        return None
    m = id_pattern.match(src_url)
    if m:
        aid = int(m.group(1))
        cur.execute('select * from articles where id = ?', (aid, ))
    else:
        cur.execute('select * from articles where url = ?', (src_url, ))
    row = cur.fetchone()
    if row is not None:
        return Article(*row)
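# id_pattern is used above but not defined in this snippet; a plausible
# definition (an assumption, not taken from the source) captures a numeric
# article id embedded in the translation URL as group 1:
import re
id_pattern = re.compile(r'^https?://[^/]+/article/(\d+)')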
def post(self):
    '''Endpoint POST /articles should handle the receipt of some article
    data in JSON format, and store it within the service.
    '''
    args = self.parser.parse_args()
    self.abort_if_article_aleady_exist(args['id'])
    article = Article(args['id'], args['title'], args['date'],
                      args['body'], args['tags'])
    result = self.storage.add(article)
    return jsonpickle.encode(result, unpicklable=False)
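# A hedged example of posting to this endpoint with the five fields parsed
# above; the host, port, and field values are illustrative.
import requests

payload = {
    "id": 0,
    "title": "Some title 0",
    "date": "2016-09-22",
    "body": "Some body text 0",
    "tags": ["tag0", "tag1"],
}
requests.post("http://localhost:5000/articles", json=payload)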
def main():
    parser = argparse.ArgumentParser(description='Dump all text')
    parser.add_argument('database', help='database to read articles from')
    parser.add_argument('lang', help='language to get articles for')
    args = parser.parse_args()
    conn = sqlite3.connect(args.database)
    cur = conn.cursor()
    cur.execute('select * from articles where lang = ?', (args.lang,))
    for article in cur.fetchall():
        article = Article(*article)
        print(article.entry.encode('utf8'))
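# The snippet defines main() but never invokes it; a conventional entry-point
# guard (an assumption about the surrounding script) would be:
if __name__ == '__main__':
    main()

# Invocation sketch, with placeholder script and database names:
#   $ python dump_text.py articles.db en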
def main(filename):
    Base.metadata.create_all(engine)
    session = Session()
    articles = pd.read_csv(filename)
    for index, row in articles.iterrows():
        logger.info("Loading article into DB")
        article = Article(row["uid"], row["body"], row["host"],
                          row["newspaper_uid"], row["n_tokens_title"],
                          row["title"], row["url"])
        session.add(article)
    session.commit()
    session.close()
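# The Article model isn't shown in this snippet; a minimal SQLAlchemy
# declarative sketch consistent with the positional constructor call above
# (column names are taken from the CSV columns, but the types and primary
# key are assumptions):
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()  # in the real project this is the shared Base used above


class Article(Base):
    __tablename__ = 'articles'

    uid = Column(String, primary_key=True)
    body = Column(String)
    host = Column(String)
    newspaper_uid = Column(String)
    n_tokens_title = Column(Integer)
    title = Column(String)
    url = Column(String, unique=True)

    def __init__(self, uid, body, host, newspaper_uid,
                 n_tokens_title, title, url):
        self.uid = uid
        self.body = body
        self.host = host
        self.newspaper_uid = newspaper_uid
        self.n_tokens_title = n_tokens_title
        self.title = title
        self.url = url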
def generate_article():
    keywords = request.form.get("topic")
    if keywords is None:
        return render_template("home.html")

    keywords = [word.lower() for word in keywords.split(" ")]

    articles = []
    for file in os.listdir("articles/"):
        if file.endswith(".txt"):
            text = open(os.path.join("articles/", file), "r").read()
            source = file[:file.index("-")]
            articles.append(Article(text, source))

    # score every article against the keywords, best match first
    weighted_articles = [(similarity(art.vector, keywords), art)
                         for art in articles]
    weighted_articles = sorted(weighted_articles, key=lambda x: -x[0])
    # keep only articles with a positive similarity score
    weighted_articles = [pair for pair in weighted_articles if pair[0] > 0]

    # use at most the top three related articles as the model
    model = weighted_articles[0:3]
    articles = [pair[1] for pair in model]

    generated_article, sources = group_sentences(articles)
    art_text = ""
    for sentence in generated_article:
        art_text += sentence[0] + " "
    if len(generated_article) > 0:
        title = create_title(art_text)
    else:
        title = "Sorry, we couldn't find any related articles!"

    # generate the text and display it
    tit_text = title.decode('utf8')
    art_text = art_text.decode('utf8')
    return render_template("home.html", title=tit_text, article=art_text)
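# similarity() is imported from elsewhere in the project; a plausible
# stand-in (an assumption, not the project's actual implementation) treats
# art.vector as a word -> weight mapping and scores keyword overlap:
def similarity(vector, keywords):
    return sum(vector.get(word, 0) for word in keywords)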
def __update_category_articles(self, category):
    response = urllib2.urlopen(self.url + category, timeout=20)
    js_data = (response.read().decode("ISO-8859-1")
               .replace("\\'", "'").replace(",\n]", "]"))
    for article_data in self.json_decoder.decode(js_data):
        # fetch the image; failures are non-fatal, the article is saved without one
        img_data = None
        if article_data['image_url'] != '' and self.manager.find(article_data['id']) is None:
            try:
                img_response = urllib2.urlopen(article_data['image_url'], timeout=20)
                img_data = buffer(img_response.read())
            except Exception:
                pass
        article = Article(article_id=article_data['id'],
                          category=category,
                          title=article_data['title'],
                          timestamp=article_data['timestamp'],
                          body=article_data['body'],
                          image=img_data,
                          read=0)
        self.manager.save(article)
import os

from articles import Article
from utils import vectorize_text
from utils import split_sentences
from utils import self_correlate
from utils import bow_caps
from utils import average_sentiment
from utils import words_sentiment
from utils import create_title
# cluster and average_relation are assumed to come from the project's
# clustering helpers; their imports are not shown in this snippet.

CUTOFF = 0.5

articles = []
for file in os.listdir("articles/"):
    if file.endswith(".txt"):
        text = open(os.path.join("articles/", file), "r").read()
        source = file[:file.index("-")]
        articles.append(Article(text, source))

groups = cluster(articles)
num_groups = max(groups) + 1
groupings = [[] for _ in range(num_groups)]
for i in range(len(groups)):
    group_num = groups[i]
    article = articles[i]
    groupings[group_num].append(article)

correlations = []
for group in groupings:
    correlations.append((average_relation(group), len(group)))

i = len(groupings) - 1
while i >= 0:
    group = groupings[i]
def parseArticleHtml(articleListFilePath):
    '''
    @summary: Parse the html page for each article by using BeautifulSoup
              and save them into JSON format.
    @return: Return a set of Article objects that contain details of each article.
    '''
    articleList = None
    with open(articleListFilePath) as data_file:
        articleList = json.load(data_file)

    results = set()
    for i in xrange(len(articleList)):
        article = Article()
        article = jsonhelper.simple_dict_to_object(articleList[i], article)
        fileName = article.link[20:].replace('/', '-', 3).replace('/', '') + '.html'
        filePath = constants.ArticleHtmlDir + fileName
        # parse html if file exists
        if os.path.isfile(filePath):
            articleFile = open(filePath)
            try:
                bs = BeautifulSoup(articleFile, 'html.parser')

                # get total shares
                shareNode = bs.find('div', {'class': 'total-shares'})
                if shareNode:
                    article.shares = shareNode.get_text().replace('\n', '').replace('Shares', '')
                else:
                    shareNode = bs.find(lambda tag: tag.has_attr('data-shares'))
                    article.shares = shareNode.get('data-shares')
                if article.shares.endswith('k'):
                    article.shares = int(float(article.shares[:-1]) * 1000)
                article.shares = int(article.shares)

                # get number of links
                links = bs.find_all('a')
                article.num_hrefs = len(links)

                # get links to other articles
                otherArticleLinks = bs.find_all('a', {
                    'href': lambda value: value and re.search(r'mashable.com/\d{4}/\d{2}/\d{2}/', value)})
                article.num_self_hrefs = len(otherArticleLinks)

                # get content tag
                contentTag = bs.find('section', {
                    'class': lambda value: value and value.startswith('article-content')})
                # video-type articles are different
                if not contentTag:
                    contentTag = bs.find('section', {'id': 'video-matting'})
                # another type, seemingly photo posts
                if not contentTag:
                    contentTag = bs.find('div', {'id': 'skrollr-body'})
                # some articles live in an iframe
                if not contentTag:
                    iframeDivTag = bs.find(lambda tag: tag.has_attr('data-url'))
                    if iframeDivTag:
                        iframeUrl = iframeDivTag.get('data-url')
                        res = requests.get(iframeUrl)
                        iframeContent = res.text
                        bsIframe = BeautifulSoup(iframeContent, 'html.parser')
                        contentTag = bsIframe.find('div', {'id': 'content'})

                # get number of images in the article
                images = contentTag.find_all('img')
                if images:
                    article.num_imgs = len(images)

                # get number of videos in the article
                youtubeVideos = contentTag.find_all(
                    lambda tag: tag.has_attr('src') and 'youtube.com' in tag.get('src'))
                ooyalaVideos = contentTag.find_all(lambda tag: tag.has_attr('data-video'))
                article.num_videos = len(youtubeVideos) + len(ooyalaVideos)

                # get topics
                footerTopicsTag = bs.find('footer', {'class': 'article-topics'})
                if footerTopicsTag:
                    article.topics = footerTopicsTag.get_text().replace("Topics:", "").replace("\n", "")
                else:
                    # assume it is from an iframe if not found in the footer
                    jsTag = bs.find("script", {'type': 'application/ld+json'})
                    scriptContent = jsTag.get_text()
                    dic = json.loads(scriptContent.decode('utf-8'))
                    article.topics = dic['metadata']['omnitureData']['topics']

                # get days between the article publication and the dataset creation
                post_date = datetime.strptime(article.post_date[0:19], '%Y-%m-%dT%H:%M:%S')
                article.timedelta = (datetime.now() - post_date).days

                # get number of keywords from meta in head
                keywords = bs.head.find('meta', {'name': 'keywords'}).get('content')
                article.num_keywords = len(keywords.split(','))

                contentBlob = TextBlob(article.content)
                # number of words in the content
                article.n_tokens_content = len(contentBlob.words)
                # article sentiment
                article.content_sentiment_polarity = contentBlob.sentiment.polarity
                article.content_subjectivity = contentBlob.sentiment.subjectivity

                titleBlob = TextBlob(article.title)
                # number of words in the title
                article.n_tokens_title = len(titleBlob.words)
                # title sentiment
                article.title_sentiment_polarity = titleBlob.sentiment.polarity
                article.title_subjectivity = titleBlob.sentiment.subjectivity

                results.add(article)
                print 'Parsed: ' + fileName
            except Exception:
                print 'Error in: ', fileName
                traceback.print_exc()
            finally:
                articleFile.close()
        else:
            print 'File not found: ' + fileName
    return results
def article_pairs(trg_lang, src_lang):
    trg_cur.execute('select * from articles where lang = ?', (trg_lang, ))
    for article in trg_cur.fetchall():
        trg_article = Article(*article)
        src_article = find_translation(trg_article, src_lang, src_cur)
        yield trg_article, src_article
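# Hedged usage sketch: iterate target/source pairs, skipping articles for
# which find_translation found no row; 'lt' and 'en' are illustrative
# language codes, not values taken from the source.
for trg, src in article_pairs('lt', 'en'):
    if src is None:
        continue
    # ...process the aligned pair here...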
#!/usr/bin/env python
from clustering import get_clusters
from articles import Article


def accuracy(actual_good, actual_bad, predicted_good, predicted_bad):
    bad_count = 0
    for bad in predicted_bad:
        if bad in actual_bad:
            bad_count += 1
    good_count = 0
    for good in predicted_good:
        if good in actual_good:
            good_count += 1
    # fraction of predictions that agree with the hand-labelled sets
    total = len(predicted_good) + len(predicted_bad)
    return float(good_count + bad_count) / total if total else 0.0


if __name__ == '__main__':
    # articles_good = Article.read_full('./atrinkti_saulius/geri_straipsniai')
    # articles_bad = Article.read_full('./atrinkti_saulius/blogi_straipsniai')
    articles_good = Article.read_full('./atrinkti_ginte/geri')
    articles_bad = Article.read_full('./atrinkti_ginte/blogi')
    articles = articles_good + articles_bad
    clusters = get_clusters(articles)
    import pdb
    pdb.set_trace()
def get_articles():
    articles = []
    for article_json in os.listdir(ARTICLES_PATH):
        articles.append(Article(os.path.join(ARTICLES_PATH, article_json)))
    active_articles = [article for article in articles if article.active]
    return active_articles
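# A hedged sketch of an index view built on get_articles(), mirroring the
# article() route above; the app object and the "index.html" template name
# are assumptions.
@app.route("/")
def index():
    return render_template("index.html", articles=get_articles())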