def post(self, request, format=None):
    url = request.data.get('url')
    source = request.data.get('source')
    s = SummarizeUrl(url)
    foo = []
    for x in s:
        foo.append(remove_non_ascii(x))
    summary = ' '.join(foo)
    hdr = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'
    }
    req = urllib2.Request(url, headers=hdr)
    page = urllib2.urlopen(req)
    p = parse(page)
    title = p.find(".//title").text
    data = {
        'title': title,
        'url': url,
        'summarize_url': summary,
        'source': source
    }
    serializer = SummarySerializer(data=data)
    if serializer.is_valid():
        # serializer.save()
        return Response(serializer.data, status=status.HTTP_201_CREATED)
    return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def get_summary(url):
    summaries = SummarizeUrl(url)
    summary = []
    for elem in summaries:
        summary.append(elem)
    print ' '.join(summary)
def catchContent():
    number_bb = t.count('(//div[contains(@data-vr-zone, "Top Stories")]//span[contains(@class, "story-headline")])')
    df_bb = pd.DataFrame(index=range(0, number_bb - 2),
                         columns=['Sno', 'Title', 'URL', 'Summary', 'Img_URL'])

    for n in range(0, number_bb - 2):
        title = t.read('//div[contains(@data-vr-zone, "Top Stories {}")]//span[contains(@class, "story-headline")]'.format(n))
        URL_b = t.read('//div[contains(@data-vr-zone, "Top Stories {}")]//span[contains(@class, "story-headline")]//@href'.format(n))
        URL = "https://www.straitstimes.com/" + str(URL_b)
        Img_URL = t.read('//div[contains(@data-vr-zone, "Top Stories {}")]//span[contains(@class, "story-headline")]/ancestor::div[contains(@class, "body")]/..//img/@src'.format(n))
        summaries = SummarizeUrl(URL)
        df_bb.iloc[n, 0] = n
        df_bb.iloc[n, 1] = title
        df_bb.iloc[n, 2] = URL
        df_bb.iloc[n, 3] = summaries
        df_bb.iloc[n, 4] = Img_URL

    return df_bb
def fact_extract(user_input):
    #user_input = str(raw_input('Enter input: '))  this will come from the front end instead of cli
    url = wikipedia.page(user_input).url
    # s = Summarize(user_input, wikipedia.page(user_input).content) is to summarize if the url is not available
    # this one is for extracting the content directly from the keyword entered by the user
    s = SummarizeUrl(url)
    summ = ' '.join(word for word in s)
    #summ = summ.replace("]","")

    # non-regex way of dealing with expressions
    def removeNestedParentheses(s):
        ret = ''
        skip = 0
        for i in s:
            if i == '[':
                skip += 1
            elif i == ']' and skip > 0:
                skip -= 1
            elif skip == 0:
                ret += i
        return ret

    summ = removeNestedParentheses(summ)
    summ = summ.replace(". ", ".\n\n")
    return summ
def get_data(rss, num):
    pathToCSV = '../fileStore/file.csv'
    data = []
    with open(pathToCSV, 'w') as csvfile:
        wr = csv.writer(csvfile, delimiter='@', quotechar='#')
        index = 0
        for e in rss['entries']:
            if (index == int(num)):
                break
            wr.writerow([(e['title']).encode('utf-8')])
            wr.writerow([(e['link']).encode('utf-8')])
            summary = []
            try:
                for elem in SummarizeUrl(e['link'].encode('utf-8')):
                    summary.append(elem)
                wr.writerow([' '.join(summary).encode('utf-8').strip().replace('\n', '')])
            except TypeError:
                wr.writerow(['Summary Unavailable'])
            index = index + 1
def main(url):
    # open db connection
    conn = sqlite3.connect('../db/summaries.db')
    c = conn.cursor()

    # check if exists
    u = (url, )
    rows = c.execute('SELECT COUNT(*) FROM summaries WHERE url=?', u).fetchone()
    if (rows[0] != 0):
        #print url + " was found in db"
        conn.close()
        return

    # add the new article
    summaries = SummarizeUrl(url)
    if summaries is None:
        conn.close()
        return
    sums = " ".join(summaries).replace('\n', '. ')
    stage = (url, sums, date.today())
    c.execute('INSERT INTO summaries VALUES (?,?,?)', stage)

    # save & close
    conn.commit()
    conn.close()

    # delay so we dont overflow the site
    print url + " added"
    time.sleep(5)
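The example above assumes a three-column `summaries` table already exists in `../db/summaries.db`. The schema is not shown in the original snippet; a minimal setup sketch, with column names that are only assumptions, might look like this:

# Hypothetical one-time setup for the table used by main(url) above
# (column names are assumptions; the original schema is not shown).
import sqlite3

conn = sqlite3.connect('../db/summaries.db')
conn.execute('CREATE TABLE IF NOT EXISTS summaries (url TEXT, summary TEXT, added DATE)')
conn.commit()
conn.close()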
def testURLs(self):
    urls = (
        u'http://www.huffingtonpost.com/2013/11/22/twitter-forward-secrecy_n_4326599.html',
        u'http://www.bbc.co.uk/news/world-europe-30035666',
        u'http://www.bbc.co.uk/news/magazine-29631332')
    # just make sure it doesn't crash
    for url in urls:
        summaries = SummarizeUrl(url)
def fact_extract(user_input):
    url = wikipedia.page(user_input).url
    error = ''
    # s = Summarize(user_input, wikipedia.page(user_input).content) is to summarize if the url is not available
    # this one is for extracting the content directly from the keyword entered by the user
    s = SummarizeUrl(url)
    try:
        summ = ' '.join(word for word in s)
    except TypeError:
        return error
    summ = removeNestedParentheses(summ)
    summ = summ.replace(". ", ".\n\n")
    return summ
def summary(request):
    title = 'Summary'
    form = SummaryForm(request.POST or None)
    response_data = {}
    context = {
        "title": title,
        "form": form,
    }
    if request.method == 'POST':
        # validate before building the unsaved instance; calling save() on an
        # invalid ModelForm raises ValueError before the else branch is reached
        if form.is_valid():
            instance = form.save(commit=False)
            url = instance.url
            summary = SummarizeUrl(url)
            instance.summarize_url = summary
            hdr = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Accept-Encoding': 'none',
                'Accept-Language': 'en-US,en;q=0.8',
                'Connection': 'keep-alive'
            }
            req = urllib2.Request(url, headers=hdr)
            page = urllib2.urlopen(req)
            p = parse(page)
            title = p.find(".//title").text
            instance.title = title
            # instance.save()
            response_data['url'] = instance.url
            response_data['title'] = title
            response_data['summary'] = instance.summarize_url
            context = {
                "title": title,
                "url": url,
                "summary": summary,
            }
            return JsonResponse(response_data)
        else:
            return JsonResponse({"nothing to see": "this isn't happening"})
    else:
        form = SummaryForm()
    return render(request, "website/index.html", context)
def process(links):
    for s in range(1, 6):
        spath = "story" + str(s) + ".txt"
        delete_file_contents(spath)

    s = 1
    i = 0
    for link in links:
        try:
            i += 1
            spath = "story" + str(s) + ".txt"
            summaries = SummarizeUrl(link)
            append_file(spath, summaries, 0)
            if i == 3:
                i = 0
                s += 1
                retrimmer(spath)
        except:
            pass
def run():
    conn = util.create_connection("./db/news.db")
    site = util.getSiteByName(conn, "BBC News")
    site_url = site[0][2]
    site_id = site[0][0]

    results, url, number_BBC, Img_link = NewsFromBBC(site_url)

    df = pd.DataFrame(index=range(0, number_BBC - 1),
                      columns=['Sno', 'Title', 'URL', 'Summary', 'Img_URL'])
    for i in range(1, number_BBC):
        summaries = SummarizeUrl(url[i])
        df.iloc[i - 1, 0] = i
        df.iloc[i - 1, 1] = results[i]
        df.iloc[i - 1, 2] = url[i]
        df.iloc[i - 1, 3] = summaries
        df.iloc[i - 1, 4] = Img_link[i]

    df = util.fixImgLink(df, "https://cf-templates-fghyux9ggb7t-ap-southeast-1.s3-ap-southeast-1.amazonaws.com/bbc.png")
    df = util.fixSummary(df)
    util.updateNews(conn, site_id, df)
def getRSSFeedAttributes(feed, dangerFactors, citiesUSA, c, returnList, Errors, feedErrors, linkErrors):
    a = feedparser.parse(feed)
    for posts in a.entries:
        now = datetime.datetime.now()
        Date_Of_Access = str(now)
        # if 'published' in posts:
        try:
            date_published = posts['published']
            dtobject = parser.parse(date_published)
            date_published_reformated = dtobject.strftime('%Y-%m-%d %H:%M:%S')
        except Exception, e:
            date_published_reformated = None
        title = posts.title
        title = title.replace("'", "''")
        if title:
            link = posts.link
            if link:
                factorList = []
                domain = []
                locationList = []
                wordFreqFactor = []
                wordFreqLocation = []
                summary = None  # initialise so the check below is safe when SummarizeUrl raises
                try:
                    summary = (SummarizeUrl(link))
                except Exception, e:
                    SummaryError = str(e)
                    Errors.append(SummaryError)
                    feedErrors.append(feed)
                    linkErrors.append(link)
                if summary:
                    summary = u' '.join(summary)
                    summary = summary.replace("'", "''").replace("\n", " ")
                    summaryLower = summary.lower()
                    getFactorsFromText(summaryLower, summary, factorList, domain,
                                       locationList, wordFreqFactor, wordFreqLocation,
                                       dangerFactors, citiesUSA, c, title, link,
                                       date_published_reformated, Date_Of_Access,
                                       returnList, feed)
def catchContent():
    number = t.count('(//li[contains(@class, "css-1iski2w")]/a)')
    df = pd.DataFrame(index=range(0, number),
                      columns=['Sno', 'Title', 'URL', 'Summary', 'Img_URL'])

    for n in range(1, number + 1):
        title = t.read('//li[contains(@class, "css-1iski2w")][{}]/a/div'.format(n))
        URL = t.read('//li[contains(@class, "css-1iski2w")][{}]//@href'.format(n))
        Img_link = t.read('//li[contains(@class, "css-1iski2w")][{}]//img/@src'.format(n))
        summaries = SummarizeUrl(URL)
        df.iloc[n - 1, 0] = n
        df.iloc[n - 1, 1] = title.decode('utf-8')
        df.iloc[n - 1, 2] = URL
        df.iloc[n - 1, 3] = summaries
        df.iloc[n - 1, 4] = Img_link

    df['Summary'].replace('None', np.nan, inplace=True)
    df.dropna(subset=['Summary'], inplace=True, how='any')
    df = df.reset_index(drop=True)
    df['Sno'] = df.index
    return df
            url_list.append('https://news.google.com/{}'.format(raw[i]))
        except IndexError:
            # Comment plez
            url_list.append('https://news.google.com/{}'.format(raw[i]))
        i += 1
    for q in range(12):
        url_list.pop(0)
    return tuple(url_list)

# def __init__(self, q):
#
#     self.query = q
# Comment this stuff out

if __name__ == "__main__":
    # Use this Unicode format though or it will break Google
    query = "massachusetts"
    lis = scraper(query)
    urls = (
        u'https://www.cnet.com/news/from-friday-to-pizza-rat-top-viral-videos-of-2010s/',
        u'http://www.bbc.co.uk/news/world-europe-30035666',
        u'http://www.bbc.co.uk/news/magazine-29631332')
    for url in urls:
        summaries = SummarizeUrl(url)
        pprint(summaries)
# Add random articles
Urls = {
    'Huffington': 'http://www.huffingtonpost.com/2013/11/22/twitter-forward-secrecy_n_4326599.html'
}
Urls.update({'BBC': 'http://www.bbc.co.uk/news/business-32817114'})
Urls.update({'BBC Tech': 'http://www.bbc.co.uk/news/technology-32814309'})
Urls.update({'BBC News': 'http://www.bbc.co.uk/news/uk-england-32822298'})

print('Summarising Websites\n\n')

# Loop through each site in the Dictionary
for Site in Urls:
    # Print the name of the Site in Use
    print('Website: %s') % (Site)
    # Use the SummarizeURL function
    summaries = SummarizeUrl(Urls[Site])
    # Print Each Sentence on a new line
    for sentence in summaries:
        print sentence
    print('\n')

################# Output #################
# Importing Modules
#
# Adding Websites
#
# Summarising Websites
#
# Website: BBC
# Separately, the Federal Reserve fined a sixth bank, Bank of America, $205m over foreign exchange-rigging.
def getNews(rssDict, service, searchedImages):
    try:
        startTime = time.time()
        directory = "./data/" + service + '/'

        # create directory for service if doesnt yet exist
        if not os.path.exists(directory):
            os.makedirs(directory)
            print('created directory: %s' % directory)

        # iterate each feed in service -> fetch data -> write to temp file -> copy to actual file
        for key, value in rssDict.items():
            fileName = directory + key + "-write.json"

            # delete {category}-write.json file if one already exists
            if os.path.exists(fileName):
                os.remove(fileName)
                print('deleted existing file: %s' % fileName)

            feed = feedparser.parse(value)  # parse feed to get all the posts
            feedDict = {}
            feedCounter = 0

            # loop through posts in category
            for post in feed.entries[:20]:  # limit to 20 entries per feed
                imgUrl = "none"

                # caching enabled. this prevents asking google for images every-time
                if post.link in searchedImages:
                    imgUrl = searchedImages[post.link]
                    print('found image in cache for %s. done!' % post.link)
                else:
                    query = post.title.split()
                    query = '+'.join(query)
                    if (service == 'reuters'):
                        imgSearch = ("https://ajax.googleapis.com/ajax/services/search/images?v=1.0&q=" + query)
                    else:
                        imgSearch = ("https://ajax.googleapis.com/ajax/services/search/images?v=1.0&q=" + service + "+" + query)

                    imgSearchRequest = requests.get(imgSearch, verify=False)
                    if (imgSearchRequest.status_code == 200):  # on get success
                        imgSearchData = imgSearchRequest.json()
                        try:
                            getNextImg = 1
                            imgUrl = imgSearchData['responseData']['results'][0]['url']
                            if (service == 'reuters'):
                                imgUrl = urllib2.unquote(imgUrl)
                            badBbcUrl = 'http://ichef.bbci.co.uk/news/660/media/images/80201000/png/_80201000_breaking_image_large-3.png'
                            if (service == 'bbc' and imgUrl == badBbcUrl):
                                imgUrl = imgSearchData['responseData']['results'][1]['url']
                                getNextImg = 2

                            # check if selected url is actually an image
                            # if not, choose the next url
                            if not 'image' in requests.get(imgUrl, verify=False).headers['content-type']:
                                print("MISSED FIRST IMG URL = BAD CONTENT. SECOND FETCH!")
                                imgUrl = imgSearchData['responseData']['results'][getNextImg]['url']

                            searchedImages[post.link] = imgUrl  # add to image cache if img found
                            print('image not in cache but new one fetched for %s. done!' % post.link)
                        except (TypeError, IndexError, requests.exceptions.MissingSchema):
                            print('DENIAL FROM GOOGLE for %s. failed!' % post.link)
                            imgUrl = "200F"
                    else:
                        imgUrl = "404"
                        print('image not in cache. also couldnt fetch new one for %s. failed!' % post.link)

                summary = SummarizeUrl(post.link)  # summarize text from article
                feedDict[feedCounter] = [post.title, post.link, summary, imgUrl]
                feedCounter += 1

            # write the collected data to {category}-write.json in json format
            with open(fileName, 'w') as fp:
                json.dump(feedDict, fp)
                print('wrote file: %s' % fileName)

        # iterate through all categories and copy temp files to the actual files
        for key, value in rssDict.items():
            source = directory + key + "-write.json"
            destination = directory + key + ".json"
            if os.path.exists(source):
                copyfile(source, destination)
                print('copied file: %s' % destination)
            else:
                print('cannot copy file: source %s not found' % source)

        print("--- %s seconds ---\n" % (time.time() - startTime))  # iteration runtime
    except Exception:
        print("EXCEPTION ERROR EXCEPTION ERROR!!")
        pass
        len(monarchs) - len(featured) - 1) + ' features queued)'
    to = 'test-email'
    if production == True:
        to = 'Stevie Wonder Says <*****@*****.**>'
    subject = s
    messageText = random.choice([
        "Today's subject is ", "This week's subject is ",
        "Today let's have a look at ", "Today's topic is ",
        "Today let's discuss "
    ])
    messageText += s + '.<br /><br />Did you know...?<br /><br />'
    imageUrls = im.getNImages(s, 5)
    sentences = SummarizeUrl(wikipediaUrl)
    if sentences is None:
        sentences = ['Little is known about ' + s + '.'] * 5
    for i in xrange(5):
        messageText += sentences[i].encode('utf-8') + '<br /><br />'
        messageText += '<center><img style="max-height: 480px; max-width: 480px;" alt="one of the best pictures of ' + s + '"" src="' + imageUrls[i].encode('utf-8') + '" /></center><br /><br />'
    messageText += "That's all for now. Have a great week!<br /><br />Long live the queen,<br />Steve"
    # feel free to write back with feedback or suggestions on how to improve my newsletter
    message = sendEmail.CreateHTMLMessage(sender, to, subject, messageText)
    sendEmail.SendMessage(quickstart.gmail_service, 'me', message)
# End of get_json_files()

if __name__ == "__main__":
    args = parse_arguments()
    json_files = get_json_files(args.dataset)
    total_summaries = []

    # Summarizing from URL
    print("Summarizing from URLs")
    for current_file in [json_files[0]]:
        with open(current_file, 'r') as json_file:
            for line in json_file:
                record = json.loads(line)
                url = record['URL_s']
                print("Summarizing...", url)
                summary_sentences = SummarizeUrl(url)
                if summary_sentences:
                    total_summaries.append(" ".join(summary_sentences))
        print("Done processing one file")
    print("Finished first pass through all records")

    print("Recombining and summarizing...")
    while len(total_summaries) > 15:
        summaries_to_join = int(len(total_summaries) / 15)
        if summaries_to_join == 1:
            break
        if summaries_to_join > 20:
            summaries_to_join = 20
        combined_summaries = [
            " ".join(total_summaries[i:i + summaries_to_join])
            for i in range(0, len(total_summaries), summaries_to_join)
def longSummaryFromQuery(query):
    g = d.get_zci(query, True)
    if "(" in g:
        return Summarize(query, g, 15)
    else:
        return SummarizeUrl(g, 15)
#rss = 'https://www.incibe-cert.es/bfeed/avisos-sci/all'
b = float('0.0')
lenlista = len(listaulr)
for rss in listaulr:
    print((b / lenlista) * 100)
    if hasattr(ssl, '_create_unverified_context'):
        ssl._create_default_https_context = ssl._create_unverified_context
    d = feedparser.parse(rss.replace('"', ''))  #<<WORKS!!
    if (len(d.entries) > 10):
        ent = 10
    else:
        ent = len(d.entries)
    coda = Coda('c5d228f1-a38e-4956-b6e6-f6ceaac089d0')
    for i in range(0, int(ent)):
        try:
            summaries = SummarizeUrl(d.entries[i].link)
            payload = {
                'rows': [
                    {
                        'cells': [
                            {'column': 'c-_zKlSZgEHp', 'value': d.entries[i].title},
                            {'column': 'c-FaaJIgjDAb', 'value': d.entries[i].link},
                            {'column': 'c-MM-bPQUWUb', 'value': summaries},
                        ],
                    },
                ],
                "keyColumns": ["c-_zKlSZgEHp"]
            }
            print(payload)
            coda.upsert_row('G6ody-h41g', 'grid-30lv9fOPY1', payload)
        except:
def wholeArticleFromQuery(query):
    g = d.get_zci(query, True)
    if "(" in g:
        return Summarize(query, g, 1500)
    else:
        return SummarizeUrl(g, 1500)
def tease_url(url):
    summaries = SummarizeUrl(url)
    print(summaries)
def summarize():
    url = urllib2.unquote(request.args.get('u') or '')
    summaries = SummarizeUrl(url)
    return jsonify(result=summaries)
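Most of the examples above guard against `SummarizeUrl` returning `None` when an article cannot be fetched or parsed. A minimal, self-contained sketch of that common pattern is shown below; it assumes the pyteaser-style `SummarizeUrl` (the snippets do not name the package explicitly), and `safe_summary` is a hypothetical helper added here for illustration:

# Minimal sketch, assuming SummarizeUrl comes from pyteaser and returns
# either a list of key sentences or None when extraction fails.
from pyteaser import SummarizeUrl

def safe_summary(url):
    sentences = SummarizeUrl(url)
    if not sentences:  # None or empty: nothing could be extracted
        return 'Summary Unavailable'
    return ' '.join(sentences)

if __name__ == '__main__':
    print(safe_summary('http://www.bbc.co.uk/news/world-europe-30035666'))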