def addToDict():
    """Fetch categorized articles from the internal API and group them by
    extracted location into the module-level aggregatedDict.

    Each aggregatedDict[location] is a list of article dicts with keys:
    url, author, title, credRating, topic.
    """
    # Position in the outer master list encodes the topic category.
    # (Replaces a chain of `if x is 0:` identity comparisons, which are
    # not guaranteed to work for ints.)
    TOPIC_BY_INDEX = {
        0: 'general',
        1: 'technology',
        2: 'sports',
        3: 'business',
        4: 'entertainment',
        5: 'science',
    }
    masterList = api.generateResponse()
    for x, category in enumerate(masterList):
        # Each article row is (title, url, author) by position.
        for article in category:
            url = article[1]
            # Up to 4 attempts per article: parsing/geocoding can fail
            # transiently.  The original looped 4 times unconditionally,
            # which duplicated every successful article up to 4 times;
            # we now stop after the first success.
            for _attempt in range(4):
                try:
                    locations = nlp.HTMLParser(url)
                    for loc in locations:
                        location = str(getAddress(str(loc)))
                        entry = {
                            'url': url,
                            'author': article[2],
                            'title': article[0],
                            'credRating': svm.compute(url),
                        }
                        if x in TOPIC_BY_INDEX:
                            entry['topic'] = TOPIC_BY_INDEX[x]
                        aggregatedDict.setdefault(location, []).append(entry)
                except Exception:
                    # Best-effort: skip this attempt and retry.
                    continue
                else:
                    break  # success -- do not re-add duplicates
def getNewsAPI():
    """Fetch the NewsAPI v1 source list, then the top articles for each
    source, and aggregate them by extracted location into aggregatedDict.
    """
    raw = urllib2.urlopen("https://newsapi.org/v1/sources").read()
    # Parse the HTTP body as JSON (the original used ast.literal_eval and
    # then `.items()[1]`, which depends on arbitrary dict ordering).  The
    # /v1/sources response carries the list under the 'sources' key.
    sources = json.loads(raw)['sources']
    for source in sources:
        articleid = source['id']
        topic = source['category']
        # Collapse related categories into the buckets used elsewhere.
        if topic in ('entertainment', 'gaming', 'music'):
            topic = 'entertainment'
        if topic == 'science-and-nature':
            topic = 'science'
        payload = {
            'source': articleid,
            # NOTE(review): API key hard-coded in source -- move to config.
            'apiKey': '6f62a98cbb734492abbdba50a4bdff86',
            'sortBy': 'top',
        }
        r = requests.get('https://newsapi.org/v1/articles', params=payload)
        body = r.json()
        if str(body['status']) != 'ok':
            continue
        for jsonReceived in body['articles']:
            title = jsonReceived[u'title']
            author = jsonReceived[u'author']
            url = jsonReceived[u'url']
            try:
                locations = nlp.HTMLParser(url)
                for loc in locations:
                    location = str(getAddress(str(loc)))
                    entry = {
                        'url': url,
                        'author': author,
                        'title': title,
                        'credRating': svm.compute(url),
                        'topic': topic,
                    }
                    aggregatedDict.setdefault(location, []).append(entry)
            except Exception:
                # Best-effort: skip articles whose pages cannot be
                # parsed or geocoded.
                pass
def manualSearch(query):
    """Search Bing News for *query* (freshness: past week), score up to 5
    results with the credibility SVM, and return a JSON string of dicts
    with keys: id, description, url, score.

    Returns an empty list (not a JSON string) when the API response has
    no 'value' field -- callers rely on that sentinel.
    """
    url = 'https://api.cognitive.microsoft.com/bing/v5.0/news/search?q=' + query
    # query string parameters
    payload = {'q': query, 'freshness': 'Week'}
    # custom headers
    # NOTE(review): subscription key hard-coded in source -- move to config.
    headers = {'Ocp-Apim-Subscription-Key': '22207001cbdc4c2487ad91d1cec1bdf2'}
    r = requests.get(url, params=payload, headers=headers)
    try:
        listOfArticles = r.json()['value']
    except Exception:
        return []
    links = []
    descriptions = []
    remaining = 5  # result cap; renamed from `max`, which shadowed the builtin
    for article in listOfArticles:
        information = article.get('clusteredArticles', article)
        if remaining == 0:
            break
        # Counted down even when the entry is skipped below, matching the
        # original accounting.
        remaining -= 1
        # Clustered results are lists; only plain article dicts carry
        # url/description.
        if isinstance(information, dict):
            links.append(information['url'])
            descriptions.append(
                str(information['description'].encode("ascii", "ignore")))
    fin = []
    for i, link in enumerate(links):
        fin.append({
            'id': str(i + 1),
            'description': descriptions[i],
            'url': link,
            'score': str(svm.compute(link)),
        })
    return json.dumps(fin)
def processURL(url):
    """Score and summarise a single article URL.

    Returns a JSON string with keys: title, score, keywords, url.
    """
    score = svm.compute(url)
    # Page <title> via lxml; article body text via Goose for entity
    # extraction.
    tree = lxml.html.parse(url)
    title = tree.find(".//title").text
    response = get(url)
    extractor = Goose()
    article = extractor.extract(raw_html=response.content)
    # Renamed from `file`, which shadowed the builtin.
    body_text = article.cleaned_text
    keywords = nlp.generateEntity(body_text)
    return json.dumps({
        'title': title,
        'score': score,
        'keywords': keywords,
        'url': url,
    })