def refresh():
    articles = [article()]
    try:
        with open('articles.json') as f:
            articles += [article(_) for _ in json.load(f)]
    except FileNotFoundError:
        pass
    except json.decoder.JSONDecodeError:
        os.remove('articles.json')
    with open('articles.json', 'w+') as f:
        json.dump([a.__dict__ for a in articles], f)
    return redirect('/', code=303)
def MPsentiment(self, i):
    filename = self.path + "/" + self.files[i]
    print(filename)
    a = article(filename, self.city, self.files[i])
    self.articles.append(a)
    if a.data is None:
        a.calculateWords()
def read_json(self, input_file, quotechar=None):
    """Load articles from a JSON file, normalising whitespace and de-duplicating by title."""
    def _clean(value):
        # strip newlines, carriage returns and tabs from a field
        return str(value).replace("\n", "").replace("\r", "").replace("\t", "")

    lists = []
    with codecs.open(input_file, "r", "utf-8") as fd:
        records = json.loads(fd.read())
    for item in records:
        article_ins = article()
        article_ins.label = _clean(item.get("label", "None"))
        article_ins.title = _clean(item.get("title", "None"))
        article_ins.content = _clean(item.get("content", "None"))
        article_ins.ctime = _clean(item.get("ctime", "None"))
        article_ins.url = _clean(item.get("url", "None"))
        article_ins.uuid = _clean(item.get("uuid", "None"))
        lists.append(article_ins)
    return list(self._dedupe(lists, key=lambda a: a.title))
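# Input sketch for read_json (hypothetical file; the format is inferred from
# the field handling above, and every key is optional, defaulting to "None"):
# [
#     {
#         "label": "0",
#         "title": "Example headline",
#         "content": "Body text ...",
#         "ctime": "1581955200",
#         "url": "https://example.com/a",
#         "uuid": "abc-123"
#     }
# ]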
def test_profile_author():
    url = "https://www.nytimes.com/2020/02/17/world/asia/coronavirus-westerdam-cambodia-hun-sen.html?action=click&module=Top%20Stories&pgtype=Homepage"
    test_article = article(url)
    article_result = test_article.get()
    test_author = authorCard(article_result["article_id"],
                             article_result["author_page_link"],
                             article_result["profile"],
                             article_result["author_name"])
    print(test_author.get())
def main():
    with open('plots.txt', 'r') as f1, open('titles.txt', 'r') as f2:
        plots = f1.read().split("<EOS>")
        titles = f2.read().split("\n")

    articles = [article(titles[i], plots[i]) for i in range(len(titles))]

    csvData = [["Title", "NW_Title", "ND_Title", "NW_Plot", "ND_Plot"]]
    for a in articles:
        csvData.append([a.title, a.wordsTitle, a.digitsTitle, a.wordsPlot, a.digitsPlot])

    csv.register_dialect('mD', quoting=csv.QUOTE_ALL, skipinitialspace=True)
    # newline='' prevents the csv module from writing blank rows on Windows
    with open('dataset2.csv', 'w', newline='') as csvFile:
        writer = csv.writer(csvFile, dialect='mD')
        writer.writerows(csvData)
def nyt_scraper(my_url):
    '''
    Returns a list of article objects from the scraped NYT news URL.
    You can choose different RSS URLs to serve up.
    '''
    # my_url = URL of rss feed / whatever you need
    uClient = urllib.request.urlopen(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, 'xml')
    article_list = []
    for item in page_soup.findAll('item'):  # for each RSS article
        title = item.title.text
        description = item.description.text
        date = str(item.pubDate.text)
        link = item.link.text
        uClient = urllib.request.urlopen(link)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        stories = page_soup.findAll("p")
        body = ""
        for bod in stories:  # gets the strings from the paragraphs
            body = body + bod.text
        if len(body) > 0:
            article_list.append(article(title, description, date, link, body))
    return article_list
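# Usage sketch for the scraper above. The feed URL here is an assumption;
# any RSS feed whose <item> entries carry title/description/pubDate/link
# should work:
#
#   articles = nyt_scraper("https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml")
#   for a in articles[:3]:
#       print(a)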
def func_article(artname):
    instance = handleSta.handleSta()
    instance.read()
    instance.setValue(artname, instance.getValue(artname, 0) + 1)
    instance.setValue('pv', instance.getValue('pv', 0) + 1)
    instance.write()
    read = article.article('./article/' + artname)
    return read.read()
def articles(guid):
    try:
        with open('articles.json') as f:
            articles = [article(_) for _ in json.load(f)]
        for art in articles:
            if art.guid == guid:
                return render_template('article.html', article=art)
    except FileNotFoundError:
        return redirect('refresh', code=303)
    return redirect('/', code=303)
def index(*args):
    articles = []
    try:
        with open('articles.json') as f:
            try:
                articles = [article(_) for _ in json.load(f)]
            except json.decoder.JSONDecodeError:
                os.remove('articles.json')
                return redirect('refresh', code=303)
    except FileNotFoundError:
        return redirect('refresh', code=303)
    return render_template('main.html', lines=articles)
def get(self):
    '''
    The main function in citationsNetwork: check whether publisher information
    exists in the citations table, and use the Article class to obtain
    information for each citation link.
    :return: a json dictionary that contains:
        'article_paragraphs': []
        'citation_links': []
        'citation_info': {'link': {'article_title', 'article_content', 'article_credibility'}}
    :return: None if the article is not in the database
    '''
    json_dict = dict()
    # extract information from database
    citation_results = self.db.lookup_citation(self.article_id)
    if citation_results is None:
        return None
    article_paragraphs = citation_results[0]
    citation_links = citation_results[1]
    # obtain specific info for each citation link
    # TODO: advance parallel processing
    # NOW: limit to three citations only (for convenience)
    citation_info = dict()
    count = 0
    for i in range(len(citation_links)):
        one_info = dict()
        # skip missing citations, or everything past the three-citation limit
        if citation_links[i] == "None" or count >= 3:
            citation_links[i] = "None"
            continue
        cited_article = article(citation_links[i])
        article_result = cited_article.get()
        if article_result is not None:
            # create inner dict to store information for one citation
            one_info['article_title'] = article_result['article_title']
            one_info['article_content'] = article_result['article_content']
            one_info['article_credibility'] = article_result['article_reliability']
            citation_info[citation_links[i]] = one_info
            count += 1
        else:
            # drop citation links that could not be profiled
            citation_links[i] = "None"
    json_dict['article_paragraphs'] = article_paragraphs
    json_dict['citation_links'] = citation_links
    json_dict['citation_info'] = citation_info
    return json_dict
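# Illustrative shape of the dict returned by get() above (values are made up;
# the keys follow the docstring and the assignments in the code):
# {
#     'article_paragraphs': ['First paragraph ...', 'Second paragraph ...'],
#     'citation_links': ['https://example.com/cited-1', 'None'],
#     'citation_info': {
#         'https://example.com/cited-1': {
#             'article_title': 'Cited headline',
#             'article_content': 'Cited body text ...',
#             'article_credibility': 0.72
#         }
#     }
# }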
def polarity():
    articles = []
    # store the corresponding percentage for each file
    percent = []
    polarity = 0
    for i, name in enumerate(files):
        filename = path + "/" + name
        articles.append(article(filename))
        articles[i].calculateWords()
        # bug fix: append instead of assigning into an empty list
        percent.append(articles[i].formula())
    for p in percent:
        # bug fix: sum the values themselves rather than indexing by value
        polarity += p
    # presumably the intended result of this function
    return polarity
def quantify(word, number=0):
    """
    Returns a phrase describing the number of given objects.
    Two objects are described as being a pair, fewer than eight is several,
    fewer than twenty is a number of, fewer than two hundred are dozens,
    and anything bigger is described as being tens or hundreds of thousands
    or millions. For example: chicken, 100 -> dozens of chickens
    """
    def _plural(word):
        return plural(word, custom=quantify_custom_plurals)

    if number == 0:
        return "no " + _plural(word)
    if number == 1:
        return article(word)
    if number == 2:
        return "a pair of " + _plural(word)
    if number in range(3, 8):
        return "several " + _plural(word)
    if number in range(8, 20):
        return "a number of " + _plural(word)
    if number in range(20, 200):
        return "dozens of " + _plural(word)
    if number >= 200:
        thousands = int(log(number, 10) / 3)
        subthousands = int(log(number, 10) % 3)
        if subthousands == 2:
            stword = "hundreds of "
        elif subthousands == 1:
            stword = "tens of "
        else:
            stword = ""
        if thousands > 0:
            thword = _plural(numeral_thousands(thousands - 1)) + " of "
        else:
            thword = ""
        return stword + thword + _plural(word)
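# Usage sketch for quantify(), traced from the branches above (assumes
# plural() pluralises regularly, article() prepends an indefinite article,
# and numeral_thousands(0) is "thousand"):
# quantify("chicken", 0)    -> "no chickens"
# quantify("chicken", 1)    -> "a chicken"
# quantify("chicken", 2)    -> "a pair of chickens"
# quantify("chicken", 100)  -> "dozens of chickens"
# quantify("chicken", 250)  -> "hundreds of chickens"   (subthousands == 2)
# quantify("chicken", 5000) -> "thousands of chickens"  (thousands == 1)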
def article(request):
    print("Running tests")
    description = request.GET.get('description')
    if description is None:
        return render_to_response('project/article.html', {'description': description})
    queryArticle = a.article(body=description)
    facts = extraction.extractFacts(queryArticle)
    reports = []
    for fact in facts:
        reports.append(filter_f.filter_facts(fact, facts_db))
    return render_to_response('project/article.html', {
        'description': description,
        'reports': reports
    })
def index():
    art = os.listdir('./article')
    give = []
    instance = handleSta.handleSta()
    instance.read()
    for item in art:
        path = './article/' + item
        read = article.article(path)
        instance.setValue(item, instance.getValue(item, 0))
        give.append([item, read.read_title() + ' (' + read.read_time() +
                     u' views ' + str(instance.getValue(item, 0)) + ')'])
    give = sorted(give, reverse=True)
    getIp = ip.ip().getIpInfo()
    instance.setValue('pv', instance.getValue('pv', 0) + 1)
    instance.write()
    return jinja2_template('templates/home.html',
                           domain=settings.domain,
                           users=give,
                           ipInfo=getIp,
                           pv=instance.getValue('pv', 0))
def main():
    personas = generatePersonas()
    for subdir, dirs, files in os.walk("../dataset/articles/"):
        for file in files:
            filepath = subdir + os.sep + file
            print(file)
            if file != ".DS_Store":
                with codecs.open(filepath, "r", encoding='utf-8') as myfile:
                    wholeText = myfile.read()
                print(wholeText)
                # strip non-ASCII characters (this iterates characters, not words)
                for ch in wholeText:
                    if not isAscii(ch):
                        # bug fix: str.replace returns a new string
                        wholeText = wholeText.replace(ch, '')
                arcl = article.article(wholeText, filepath)
                for person in personas:
                    val = person.personaConsumer(arcl.tagList)
                    if val == 1:
                        person.articles.append(arcl)
    savePersonas(personas)
def ap_scraper(my_url):
    '''
    Returns a list of article objects from the scraped AP news URL.
    You can choose different URLs for Associated Press RSS to serve up.
    '''
    # my_url = "http://feeds.bbci.co.uk/news/rss.xml"  # URL of rss feed / whatever you need
    uClient = urllib.request.urlopen(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, 'xml')
    article_list = []
    for item in page_soup.findAll('item'):
        title = item.title.text
        description = item.description.text
        date = str(item.pubDate.text)
        # bug fix: the original appended the title here instead of the link
        link = item.link.text
        article_list.append(article(title, description, date, link))
    return article_list
def process_author_credibility(self, author_article_list):
    '''
    Helper function for get: use the scrapy crawler to store author
    credibility information, and the NELA tool to generate credibility scores.
    :param author_article_list: a list of article links from this author
    '''
    # use at most the five most recent articles to generate the author's
    # reliability and bias scores
    accumulated_reliability = list()
    accumulated_bias = list()
    for article_link in author_article_list:
        # get article credibility to process
        each_article = article(article_link)
        article_result = each_article.get()
        print(article_link)
        if article_result is None:
            continue
        if article_result["article_reliability"] >= 0:
            accumulated_reliability.append(article_result["article_reliability"])
            accumulated_bias.append(article_result["article_bias"])
        if len(accumulated_reliability) >= 5:
            break
    if len(accumulated_reliability) != 0:
        avg_reliability = self.sum_list_scores(accumulated_reliability)
        avg_bias = self.sum_list_scores(accumulated_bias)
    else:
        # -100 represents reliability/bias not available
        avg_reliability = -100
        avg_bias = -100
    self.db.insert_author_credibility(self.author_id, avg_reliability, avg_bias)
def wpo_scraper(my_url):
    '''
    Returns a list of article objects from the scraped Washington Post URL.
    You can choose different Washington Post RSS URLs to serve up.
    '''
    # my_url = "http://feeds.washingtonpost.com/rss/rss_election-2012"  # URL of rss feed / whatever you need
    uClient = urllib.request.urlopen(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, 'xml')
    article_list = []
    for item in page_soup.findAll('item'):  # for each RSS article
        title = item.title.text
        description = item.description.text
        date = str(item.pubDate.text)
        link = item.link.text
        print(title)
        print(description)
        uClient = urllib.request.urlopen(link)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "xml")
        stories = page_soup.findAll("div", {"class": "article-body"})
        body = ""
        for bod in stories:  # gets the strings from the paragraphs
            paragraphs = bod.findAll("p")
            for pars in paragraphs:
                body = body + pars.text
        if len(body) > 0:
            article_list.append(article(title, description, date, link, body))
    return article_list
def storeStories():
    # print("Updating articles.")
    news.drop()
    stories = []
    year = datetime.now().year
    month = datetime.now().month
    NYT = requests.get("https://api.nytimes.com/svc/archive/v1/" + str(year) +
                       "/" + str(month) + ".json?api-key=" +
                       os.environ.get('NYT_API')).json()
    docs = NYT["response"]["docs"]
    # keep at most the last 100 stories of the month
    for story in docs[-min(len(docs), 100):]:
        art = getNYTArticle(story['web_url'])
        subjectivity, grade, sentimentality = subjectivity_and_grade(art)
        datetime_obj = datetime.strptime(story["pub_date"], '%Y-%m-%dT%H:%M:%S+%f')
        date_string = (str(datetime_obj.month) + "/" + str(datetime_obj.day) +
                       "/" + str(datetime_obj.year))
        a = article(int(grade), story["web_url"], "NYT",
                    story["headline"]["main"], story["lead_paragraph"],
                    date_string, subjectivity, sentimentality)
        stories.append(vars(a))
    news.insert_many(stories)
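# Sanity check for the pub_date parsing above (hedged: this assumes NYT
# archive dates look like "2020-02-17T10:00:00+0000"; the '%f' directive
# happens to consume the trailing UTC-offset digits as microseconds):
from datetime import datetime

dt = datetime.strptime("2020-02-17T10:00:00+0000", '%Y-%m-%dT%H:%M:%S+%f')
assert (dt.month, dt.day, dt.year) == (2, 17, 2020)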
def exportsinglearticle(url, silent=False):
    '''
    Bind method to put an article from a url into a txt file.
    Variable type: url - String
    '''
    url = url_decorator(url)
    article_wanted = article(url)
    title = article_wanted.gettitle()
    author = article_wanted.getauthor()
    content = article_wanted.getcontent()
    summary = article_wanted.getsummary()
    notes = article_wanted.getnotes()
    chapter = article_wanted.getchap()
    related_chaps = article_wanted.get_related_chaps()
    if not silent:
        print('Exporting ' + title)
    write_totxt('./article',
                title=title,
                author=author,
                content=content,
                chapter=chapter,
                summary=summary,
                notes=notes)
    return related_chaps
# fragment: year, month, day, docId and the ac module come from the enclosing scope
if day < 10:
    URL = year + month + '0' + str(day) + '.html'
else:
    URL = year + month + str(day) + '.html'
page = requests.get(URL)
tree = html.fromstring(page.text)
URLs = tree.xpath('//div[@class="headlineMed"]/a/@href')
date = URL[-13:-5]
f = open('output/' + str(date) + '.txt', 'w')
# generate the random vector (python generates a sample without
# replacement from a range of numbers)
for num in random.sample(range(0, len(URLs)), int(len(URLs))):
    doc = ac.article('', date, '', URLs[num], -1)
    curpage = requests.get(doc.URL)
    curtree = html.fromstring(curpage.text)
    Title = curtree.xpath('//*[@id="content"]/div[4]/div/div[3]/div[1]/h1/text()')
    Paragraphs = curtree.xpath('//*[@id="articleText"]/p/text()')
    if len(Title) > 0:
        doc.Title = Title[0].replace('\"', '')
        Paragraphs.append(Title[0])
    doc.Text = " ".join(Paragraphs)
    doc.Text = doc.Text.replace('\n', ' ')
    doc.Text = doc.Text.replace('\"', '')
    if len(doc.Text.split()) > 100:
        docId = docId + 1
        doc.id = docId
        print(doc.id)
def getchapurls(url):
    article_instance = article(url)
    return article_instance.get_related_chaps()
def test_profile_article():
    url = "https://www.nytimes.com/2020/02/17/world/asia/coronavirus-westerdam-cambodia-hun-sen.html?action=click&module=Top%20Stories&pgtype=Homepage"
    test_article = article(url)
    result = test_article.get()
    print(result)
def set_the_article_file_location(step):
    world.article = article()
    world.article.set_file_location(test_xml_path, world.document)
# print('Input format is "', input)
# print('Output format is "', output)
# print('DOI is "', doi)
# Check for minimum parameters - need input and output
if ((input is None and output is None)
        or (input == "fluidinfo" and doi is None)
        or (input == "fluidinfo" and doi is not None and output is None)):
    print("Insufficient parameters supplied", file=sys.stderr)
    print("for help use --help", file=sys.stderr)
    return 2
if input == "fluidinfo" and doi is not None:
    a = article.article(doi)
    a.load_from_fi()
    if output == "article":
        # Test output
        print(json.dumps(a.data(), sort_keys=True, indent=4))
    elif output == "fluidinfo":
        # In from fluidinfo, out to fluidinfo
        # (not recommended except for testing purposes)
        load_article_into_fi(a)
elif input != "fluidinfo":
    # Presumably a document name provided
    a = load_article(input)
    if output == "article":
        # Test output
        print(json.dumps(a.data(), sort_keys=True, indent=4))
complete = db.table.find({'mostComplete': {'$gte': 1}}).sort(
    [('mostComplete', 1)]).limit(5)
recents = db.table.find().sort([('recentlyUpdated', -1)]).limit(5)
complete_articles = formatter['article_list'](complete)
recent_articles = formatter['article_list'](recents)
all_articles = formatter['article_list'](articles)
"""
#get the most updated articles
for article in articles:
    title = article['title']
    author = formatter['authorIDs'](article['authorIDs'])
    year = article['year']
    url = "http://%s?doi=%s" % (article_url, article['doi'])
    output += "<a href='%s'>%s, %s, %s</a><br/>" % (url, title, author, year)
"""
output = index_template.render_unicode(recents=recent_articles,
                                       completes=complete_articles,
                                       all_articles=all_articles)
return output


if __name__ == '__main__':
    cherrypy.config.update({'server.socket_host': url})
    cherrypy.config.update({'server.socket_port': index_port})
    index = index()
    index.article = article.article()
    cherrypy.quickstart(index)
def process_URL(self):
    '''
    Set up the article class to process and store information in
    self.article_content; set up the author card class to process and
    store in self.author_card.
    :return: sets self.URL_status to True if the article is crawled
             successfully, otherwise sets self.URL_status to False
    '''
    # set up article class
    new_article = article(self.URL)
    article_result = new_article.get()
    if article_result is None:
        self.article_content = dict()
        self.URL_status = False
        return
    self.URL_status = True
    # store article information to self.article_content
    self.article_content = dict()
    self.article_content["article_title"] = article_result["article_title"]
    self.article_content["article_content"] = article_result["article_content"]
    self.article_content["author_name"] = article_result["author_name"]
    self.article_content["publisher_name"] = article_result["publisher_name"]
    self.article_content["article_reliability"] = article_result["article_reliability"]
    self.article_content["article_bias"] = article_result["article_bias"]
    # set up author card class
    new_author_card = authorCard(
        article_id=article_result["article_id"],
        author_page_link=article_result["author_page_link"],
        profile=article_result["profile"],
        author_name=article_result["author_name"])
    author_result = new_author_card.get()
    # store author information to self.author_card
    if author_result is not None:
        self.author_card = dict()
        self.author_card["author_name"] = author_result["author_name"]
        self.author_card["author_introduction"] = author_result["author_intro"]
        self.author_card["author_reliability_score"] = author_result["author_reliability"]
        self.author_card["author_bias_score"] = author_result["author_bias"]
        self.author_card["author_link"] = author_result["author_link"]
    else:
        self.author_card = None
    # set up publisher card class
    new_publisher_card = publisherCard(article_result["profile"])
    publisher_result = new_publisher_card.get()
    # store publisher information to self.publisher_card
    if publisher_result is not None:
        self.publisher_card = dict()
        self.publisher_card["publisher_name"] = publisher_result["publisher_name"]
        self.publisher_card["publisher_introduction"] = publisher_result["publisher_intro"]
        self.publisher_card["publisher_reliability_score"] = publisher_result["publisher_reliability_score"]
        self.publisher_card["publisher_link"] = publisher_result["publisher_link"]
    # set up citation network class
    new_citation_network = citationsNetwork(article_result["article_id"])
    citation_network_result = new_citation_network.get()
    if citation_network_result is not None:
        # store partial information in article_content
        self.article_content["article_paragraphs"] = citation_network_result["article_paragraphs"]
        self.article_content["citation_links"] = citation_network_result["citation_links"]
        # store specific citation info to self.citation_network
        self.citation_network = citation_network_result["citation_info"]
    else:
        self.citation_network = None
def article(self, word):
    return article.article(word)
def load_article(document=None):
    # Build an article object from an XML file
    path = None
    a = article.article()
    a.parse_document(path, document)
    return a
def testCall():
    testArticle = article(title, description, date, link, body)
    print(extractFacts(testArticle))
def aboutme():
    read = article.article('./special/aboutme.md')
    return read.read()
def query(self, title):
    self.cursor.execute(self.query_by_title_sql, (title,))
    data = self.cursor.fetchall()[0]
    art = article(data[1], data[2], data[3], data[4])
    return art.__dict__
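# Hedged sketch of the statement query() depends on: query_by_title_sql is
# not shown here, but the indexing above implies a row whose column 0 is
# skipped (likely an id) and whose columns 1-4 feed article(...), i.e.
# something along the lines of:
# SELECT id, col1, col2, col3, col4 FROM articles WHERE title = %s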