def save_to_analysis(self):
    """Aggregate today's analyzed articles into per-country collectedData rows.

    For every target country, run the relationship analyzer over each of
    today's newsData entries for this publisher, then merge the result into
    the matching collectedData row, creating the row the first time a
    (fromCountry, toCountry, date) combination is seen.
    """
    print("calling save_to_analysis with {}".format(self.fromCountry))
    today = timezone.now()
    dates = date(today.year, today.month, today.day)
    data = newsData.objects.filter(publisher=self.publisher, date=dates)
    analyzerObj = analyzer()
    to_country = ('USA', 'CHN', 'KOR', 'PRK', 'JPN')
    for to_c in to_country:
        for datum in data:
            dataExport = analyzerObj.toRelationship(datum, to_c)
            try:
                # Row already exists: merge this article's stats into it.
                check = collectedData.objects.get(
                    fromCountry=self.fromCountry, toCountry=to_c, date=dates)
                # BUG FIX: the original used a substring test
                # (dataExport['toID'] not in check.toID), so ID "12" was
                # falsely treated as already present inside "123".  toID is
                # a space-joined list, so test list membership instead.
                seen_ids = check.toID.split()
                if dataExport['toCheck'] and dataExport['toID'] not in seen_ids:
                    # New article for this relationship: fold it in.
                    check.toCheck = dataExport['toCheck']
                    check.to_num += dataExport['to_num']
                    # BUG FIX: 'is ""' compared identity, not equality.
                    if check.toID == "":
                        check.toID = dataExport['toID']
                    else:
                        check.toID += " " + dataExport['toID']
                    check.sumcompound += dataExport['compoundSum']
                    check.avgcompound = check.sumcompound / check.to_num
                # Regardless, always refresh the publisher-wide total.
                check.total_num = analyzerObj.total_articles(self.publisher)
                check.save()
            except ObjectDoesNotExist:
                # First record for this (from, to, date): create the row.
                # BUG FIX: "is not 0" compared identity, not equality.
                if dataExport['to_num'] != 0:
                    average = dataExport['compoundSum'] / dataExport['to_num']
                else:
                    average = 0
                collectedData.objects.create(
                    fromCountry=self.fromCountry,
                    total_num=analyzerObj.total_articles(self.publisher),
                    to_num=dataExport['to_num'],
                    toCountry=to_c,
                    toCheck=dataExport['toCheck'],
                    toID=dataExport['toID'],
                    sumcompound=dataExport['compoundSum'],
                    avgcompound=average)
def get_Contents(self):
    """Crawl the NYT top-stories feed and analyze each article.

    Fetches the JSON listing, downloads every article page, heuristically
    extracts the body text, and runs sentiment + word-frequency analysis.

    Returns:
        dict keyed by article title, each value a dict with 'url', 'time'
        (YYYY-MM-DD), 'compound' (sentiment score), 'word_freq', and
        'isArticle' (False when fewer than 20 chars of text were found).
    """
    # getURL = 'https://newsapi.org/v2/top-headlines?sources=the-new-york-times&apiKey={0}'.format(settings.NYTIMES_API_KEY)
    getURL = 'https://api.nytimes.com/svc/topstories/v2/home.json?api-key={0}'.format(
        settings.NYTIMES_API_KEY)
    r = requests.get(getURL)
    json_data = r.json()
    dataExport = dict()  # export crawled info out for data store

    def class_histogram(tags):
        # Count occurrences of each (space-joined) class attribute.
        freq = dict()
        for tag in tags:
            classes = tag.get('class')
            if classes is not None:
                name = " ".join(classes)
                freq[name] = freq.get(name, 0) + 1
        return freq

    # for article in json_data['articles']:  # old api
    for article in json_data['results']:
        title = article['title']
        url = article['url']
        # time = article['publishedAt'][:10]  # old api
        time = article['published_date'][:10]
        r2 = requests.get(url)
        soup = BSoup(r2.text, 'html.parser')
        if soup is None:
            continue  # if url is not obtainable, skip to next one

        # One article type keeps its body in <p> tags with a stable class
        # name (e.g. "css-1i0edl6 e2kc3sl0"); another uses "g-..." classes,
        # in which case the body text lives in <div> tags instead.
        pDict = class_histogram(soup.find_all('p'))
        nonReg = any('g-' in name for name in pDict)
        if nonReg:
            pDict = class_histogram(soup.find_all('div'))

        # Heuristic: the most frequent class name wraps the article body.
        count = 0
        pMost = None
        for name in pDict.keys():
            if pDict[name] >= count:
                count = pDict[name]
                pMost = name

        if not nonReg:
            newsContent = soup.find_all("p", class_=pMost)
        else:
            newsContent = soup.find_all("div", class_=pMost)
        # BUG FIX: find_all returns an empty ResultSet, never None, so the
        # original "is None" guard could not fire; skip on emptiness.
        if not newsContent:
            continue

        # BUG FIX: the original called strContainer.encode(...) and threw
        # the result away (a no-op on str); removed.  Also build the text
        # with join instead of quadratic string concatenation.
        pieces = []
        for content in newsContent:
            if nonReg:
                if content.p is None:
                    continue
                pieces.append(content.p.text.replace('\n', ''))
            else:
                pieces.append(content.text)
        strContainer = "".join(" " + piece for piece in pieces)

        # Sentiment score and most frequent words for this article.
        analyzerObj = analyzer()
        compoundValue = analyzerObj.senti_Analysis(strContainer)
        wordFreq = analyzerObj.word_freq(strContainer)
        isArticle = len(strContainer) >= 20

        dataExport.update({
            title: {
                'url': url,
                'time': time,
                'compound': compoundValue,
                'word_freq': wordFreq,
                'isArticle': isArticle
            }
        })
    return dataExport
def xmlParse(self, xmlurl, publisher, keyword):
    """Parse an RSS/XML feed and analyze every article not yet stored.

    Args:
        xmlurl: URL of the publisher's XML feed.
        publisher: one of 'yonhap', 'ecns', 'japantimes'; selects the
            date format and the HTML body-extraction strategy.
        keyword: CSS class of the body container ('yonhap'/'ecns') or
            the element id holding body <p> tags ('japantimes').

    Returns:
        dict keyed by article title, each value a dict with 'url', 'time'
        (YYYY-MM-DD), 'compound', 'word_freq', and 'isArticle'.
    """
    r = requests.get(xmlurl)
    soup = BSoup(r.text, 'xml')
    dataExport = dict()
    for item in soup.find_all('item'):
        # BUG FIX: every publisher check used "is 'literal'", which tests
        # object identity and only works by accident of CPython string
        # interning; all replaced with "==".
        if publisher == 'yonhap' and '(Copyright)' in item.title.string:
            continue
        title = item.title.string
        url = item.link.string
        if newsData.objects.filter(title=title).exists():
            continue  # already stored; skip re-crawling

        # Publish date, normalized to YYYY-MM-DD.
        pub = item.pubDate.string
        if publisher == 'yonhap':
            time = pub[0:4] + '-' + pub[4:6] + '-' + pub[6:8]
        elif publisher == 'ecns':
            time = pub[:10]
        elif publisher == 'japantimes':
            time = datetime.datetime.strptime(
                pub[5:16], "%d %b %Y").strftime("%Y-%m-%d")
        else:
            # BUG FIX: an unrecognized publisher previously left `time`
            # unbound and raised NameError further down; skip instead.
            continue

        # Parse the article page itself.
        r2 = requests.get(url)
        soup2 = BSoup(r2.text, 'html.parser')
        if soup2 is None:
            continue  # if url is not obtainable, skip to next one

        if publisher == 'yonhap' or publisher == 'ecns':
            newsContent = soup2.find_all("div", class_=keyword)
        else:  # japantimes
            newsContentfind = soup2.find('div', id=keyword)
            if newsContentfind is None:
                continue  # article body not obtainable, skip
            newsContent = newsContentfind.findAll('p')
        # BUG FIX: find_all returns an empty ResultSet, never None, so the
        # original "is None" guard could not fire; skip on emptiness.
        if not newsContent:
            continue

        # BUG FIX: removed the discarded strContainer.encode(...) no-op;
        # build the text with join instead of quadratic concatenation.
        strContainer = "".join(" " + content.text for content in newsContent)
        isArticle = len(strContainer) >= 20

        analyzerObj = analyzer()
        compoundValue = analyzerObj.senti_Analysis(strContainer)
        wordFreq = analyzerObj.word_freq(strContainer)
        dataExport.update({
            title: {
                'url': url,
                'time': time,
                'compound': compoundValue,
                'word_freq': wordFreq,
                'isArticle': isArticle
            }
        })
    return dataExport