def put_daily(cur, date):
    put_counts(cur, date)
    put_actives(cur, date, False)
    # if date.weekday() == weekday_saturday:
    #     put_actives(cur, date, True)
    put_crashes(cur, date)
    summarize(date)
def index():
    errors = ""
    if request.method == 'GET':
        return render_template("index.html", errors=errors)
    else:
        query = request.form['name']
        if query == "":
            return "No Query "
        url = "http://en.wikipedia.org/wiki/" + query.lower()
        # word = Word(query)
        # word.synsets[:5]
        # defi = word.definitions[0]()
        defi = ""
        summary = ""
        if defi:
            summary += defi
        text = summarize.summarize(url, query.lower())
        summary += text
        # word = summary.split()
        # sent = ""
        # for w in word:
        #     sent += word + "%20"
        return render_template("index.html", summary=summary)
def main(argv):
    control = make_control(argv)
    sys.stdout = Logger.Logger(base_name=control.arg.base_name)
    print control

    in_df = pd.read_csv(control.path_in,
                        nrows=1000 if control.test else None)
    summary_df = summarize.summarize(in_df)
    report_summary = make_report(summary_df)
    # TODO: print correlations of each variable with price
    print summary_df

    # write output files
    summary_df.to_csv(control.path_out_summary)
    f = open(control.path_out_report, 'wb')
    pickle.dump((report_summary, control), f)
    f.close()

    if control.test:
        print 'DISCARD OUTPUT: TESTING'
    print control
    print 'done'
def GET(self, key):
    # Build the path to the requested file.
    fname = cwd("static", "files", "cluster", key)
    res = {}
    import os, json
    if not os.path.isfile(fname):
        return json.dumps({"error": "file not found"})
    web.header('Content-Type', 'application/json')
    sentence = file(fname, 'r').read()
    tags = jieba.analyse.extract_tags(sentence, 10)
    words = jieba.cut(sentence)
    freq = {}
    total = 0.0
    # TODO: load the stop words from a file instead.
    stop_words = set([
        "where", "the", "of", "is", "and", "to", "in", "that", "we",
        "for", "an", "are", "by", "be", "as", "on", "with", "can",
        "if", "from", "which", "you", "it", "this", "then", "at",
        "have", "all", "not", "one", "has", "or",
    ])
    # Count word frequencies.
    for w in words:
        if len(w.strip()) < 2:
            continue
        if w.lower() in stop_words:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
        total += freq[w]
    tags = dict([(x, freq[x]) for x in tags])
    summary = summarize.summarize(sentence)
    # Replace '\n' with '<br>' so the summary displays in HTML.
    summary = summary.replace('\n', "<br>")
    # print summary
    return json.dumps({"keyword": tags, "summary": summary})
def test_that_it_runs(self):
    text = summarize(
        "Alice and Bob are friends. Alice is fun and cuddly."
        " Bob is cute and quirky. Together they go on wonderful"
        " adventures in the land of tomorrow. Alice's cuddliness"
        " and Bob's cuteness allow them to reach their goals."
        " But before they get to them, they have to go past their"
        " mortal enemy — Mr. Boredom. He is ugly and mean. They"
        " will surely defeat him. He is no match for their abilities.")
    self.assertTrue(bool(text))
def test_when_there_arent_any_words_in_common(self):
    text = (
        "Alice is awesome. I'm hot and you're not. This is pretty sick. "
        "We are all divisive. Nothing common between these sentences. "
        "And here's one more example of that happening."
    )
    summary = summarize(text)
    self.assertEqual(
        summary,
        "Alice is awesome. I'm hot and you're not. This is pretty sick. "
        "We are all divisive. Nothing common between these sentences."
    )
def index():
    text = request.forms.getunicode('text')
    number = int_or_none(request.forms.get('number'))
    language = request.forms.get('language') or 'english'
    result = summarize(text, number, language) if number and text else None
    return {
        'text': text or DEMO_TEXT,
        'result': result,
        'number': number or 5,
        'language': language,
        'available_languages': LANGUAGES,
    }
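# int_or_none() above is referenced but not defined in this snippet. A minimal
# sketch of what such a helper might look like (the behavior is an assumption,
# not confirmed by the source): return int(value) when it parses, else None.
def int_or_none(value):
    """Best-effort conversion to int; returns None for missing or bad input."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None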
def main():
    # Create a parser object to handle arguments passed on the command line.
    parser = argparse.ArgumentParser(description='TextNow Coffee Tasting')
    subparsers = parser.add_subparsers(dest='command', help='command')

    # Define a sub-parser for each of the three available operations.
    commands = ['parse', 'summarize', 'recommend']
    parsers = {c: subparsers.add_parser(c) for c in commands}
    parsers['parse'].add_argument('arg', help='coffee descriptive name')
    parsers['summarize'].add_argument('arg', help='input csv file',
                                      type=argparse.FileType('r'))
    parsers['recommend'].add_argument('arg', help='input csv file',
                                      type=argparse.FileType('r'))

    args = parser.parse_args()
    if args.command == 'parse':
        coffee = Coffee.fromname(name=args.arg)
        coffee.display()
    if args.command == 'summarize':
        summarize.summarize(args.arg)
    if args.command == 'recommend':
        cf.recommend(args.arg)
def summarize_official():
    """
    If there is only one document, gets the summary of that document.
    Otherwise, asks the user which document they want to summarize.

    Return: Summary, or a question asking which document to summarize
    """
    # print(len(TotalDocs), flush=True)
    if len(TotalDocs) == 1:
        summary = summarize(db, TotalDocs[0], stopwords, summarizeLength=2)
        # print(len(summary), flush=True)
        msg = "The summary is: {}".format(summary)
        return statement(msg)
    else:
        return question("Which document? Give a number")
def summarizeRightDoc(Number):
    """
    Parameter: The number corresponding to the document the user wants summarized

    Goes to the corresponding document and gets the summary for that document.

    Return: Summary of the specified document
    """
    Number = "{}".format(Number)
    # print(Number, flush=True)
    documentID = TotalDocs[int(Number) - 1]
    summary = summarize(db, documentID, stopwords)
    print("{}".format(summary), flush=True)
    msg = "The summary is: {}".format(summary)
    return statement(msg)
def make_summary(pmid=None):
    r = requests.get('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
                     'elink.fcgi?dbfrom=pubmed&id=%d&cmd=prlinks'
                     '&retmode=json' % pmid)
    body = None
    # Initialize so a non-200 response does not raise NameError on return.
    summary, url = None, None
    if r.status_code == 200:
        xml = etree.fromstring(r.text)
        try:
            url = xml.xpath('//Url')[0].text
            full_text_r = requests.get(url)
            article = BS(full_text_r.text)
            paragraphs = article.findAll(['p'])
            body = ' '.join([p.text for p in paragraphs])
            summary = summarize(body, pmid)
        except:
            summary = ['PubMed provided no full text URL for PMID %d' % pmid]
            url = None
    return summary, url
def genTextMetrics(raw_text):
    summary = summaryEngine.summarize(raw_text)
    svo = textEngine.extract(summary)
    final_text_data = {
        "summary": summary,
        "svo_data": []
    }
    for scene in svo:
        # print scene
        sent_subject = scene["raw_subject"] if len(scene["simple_subject"]) == 0 else scene["simple_subject"]
        sent_object = scene["raw_object"] if len(scene["simple_object"]) == 0 else scene["simple_object"]
        sent_predicate = scene["predicate"]
        file_urls = {}
        file_urls["subject"] = getImageFromString(sent_subject)
        file_urls["verb"] = getImageFromString(sent_predicate)
        if len(sent_object) != 0:
            # print "OBJECT"
            file_urls["object"] = getImageFromString(sent_object)
        sent_data = {
            "subject": {"text": sent_subject, "image": file_urls["subject"]},
            "verb": {"text": sent_predicate, "image": file_urls["verb"]},
            "object": {
                "text": sent_object,
                "image": file_urls["object"] if len(sent_object) != 0 else None
            }
        }
        final_text_data["svo_data"].append(sent_data)
    return final_text_data
from summarize import summarize

text = (
    "Alice and Bob are friends. Alice is fun and cuddly. Bob is cute and "
    "quirky. Together they go on wonderful adventures in the land of "
    "tomorrow. Alice's cuddliness and Bob's cuteness allow them to reach "
    "their goals. But before they get to them, they have to go past their "
    "mortal enemy — Mr. Boredom. He is ugly and mean. They will surely "
    "defeat him. He is no match for their abilities."
)
sentence_count = 2
language = 'english'

summary = summarize(text, sentence_count, language=language)
print(summary)
def process():
    name = request.form['name']
    if name:
        return jsonify({'name': summarize.summarize(name)})
    return jsonify({'error': 'Missing data!'})
def test_single_sentence(self):
    text = "Alice is awesome"
    summary = summarize(text)
    self.assertEqual(text, summary)
def test_doesnt_crash_on_empty_sentences(self):
    try:
        summarize('. . .')
    except Exception as e:
        self.fail(e)
from history import History
import scrape
from summarize import summarize

# This application is a work in progress and is not meant to be run except
# for testing purposes.
if __name__ == '__main__':
    history = History()
    target = scrape.get_article(scrape.url)
    print('\n\n\n')
    print(summarize(target[3], 1.25))
def get_summary(text):
    return summarize(text, sentence_count=5, language='spanish')
import sys
import logging

import fetch
import summarize
import translate
import tag
import generate

__author__ = "imdreamrunner"
__email__ = "*****@*****.**"

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

if __name__ == "__main__":
    print("Welcome to Reader")
    args = sys.argv
    if len(args) != 2:
        print("Usage: python reader.py <fetch|generate>")
        exit(1)
    command = args[1]
    if command == "fetch":
        log.debug("Command: fetch")
        fetch.fetch_all()
        summarize.summarize()
        translate.translate()
        tag.tag()
    elif command == "generate":
        log.debug("Command: generate")
        generate.generate()
    else:
        print("Unknown command: " + command)
    log.info("Program exits.")
n_estimators = 1000
max_features = 55
## max_features = int(X.shape[1])
## max_features = 'auto'

print 'Constructing random forest classifier from training set...'
sys.stdout.flush()
time0 = time.time()
rfor = ensemble.RandomForestClassifier(n_estimators=n_estimators,
                                       max_features=max_features, n_jobs=-1)
rfor = rfor.fit(X, Y)
dt = time.time() - time0
print ' that took %.1f seconds.\n' % dt

Y_pred = rfor.predict(X)
## print 'Training sample:'
## summarize(Y, Y_pred)

Ytest_pred = rfor.predict(Xtest)
print 'Test sample:'
summarize(Ytest, Ytest_pred)

## del X, Y, Y_pred
## del Xtest, Ytest, Ytest_pred
## del rfor
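# summarize(Y, Y_pred) above is not defined in this snippet. A minimal sketch
# of what such a classification-report helper might look like (the name and
# output format are assumptions, not confirmed by the source): print and
# return basic metrics for true vs. predicted labels.
from sklearn import metrics

def summarize(y_true, y_pred):
    """Print accuracy and the confusion matrix; return them as a dict."""
    acc = metrics.accuracy_score(y_true, y_pred)
    cm = metrics.confusion_matrix(y_true, y_pred)
    print 'accuracy = %.3f' % acc
    print cm
    return {'accuracy': acc, 'confusion_matrix': cm.tolist()}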
from tfidf import TfidfModel
from summarize import summarize
from lib.db import load_docs_for_training, load_reviews_and_split_to_sentences

docs = load_docs_for_training()
tfidf = TfidfModel()
model, dictionary = tfidf.generate(docs)

target_id = 0
sentences_unfiltered = load_reviews_and_split_to_sentences(target_id)
summary_sentences = summarize(sentences_unfiltered, model, dictionary,
                              max_characters=70, user_mmr=True, sent_limit=50)
for sentence in summary_sentences:
    print(sentence.strip())
def run(num=None):
    feeds, feedfileObject = load()
    mailserver = None
    try:
        # We store the default to address as the first item in the feeds list.
        # Here we take it out and save it for later.
        default_to = ""
        if feeds and isstr(feeds[0]):
            default_to = feeds[0]
            ifeeds = feeds[1:]
        else:
            ifeeds = feeds

        if num:
            ifeeds = [feeds[num]]

        feednum = 0
        for f in ifeeds:
            try:
                feednum += 1
                if not f.active:
                    continue

                if VERBOSE:
                    print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
                r = {}
                try:
                    r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
                except TimeoutError:
                    print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
                    continue

                # Handle various status conditions, as required
                if 'status' in r:
                    if r.status == 301:
                        f.url = r['url']
                    elif r.status == 410:
                        print >>warn, "W: feed gone; deleting", f.url
                        feeds.remove(f)
                        continue

                http_status = r.get('status', 200)
                if VERBOSE > 1:
                    print >>warn, "I: http status", http_status
                http_headers = r.get('headers', {
                    'content-type': 'application/rss+xml',
                    'content-length': '1'})
                exc_type = r.get("bozo_exception", Exception()).__class__

                if http_status != 304 and not r.entries and not r.get('version', ''):
                    if http_status not in [200, 302]:
                        print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)
                    elif contains(http_headers.get('content-type', 'rss'), 'html'):
                        print >>warn, "W: looks like HTML [%d] %s" % (feednum, f.url)
                    elif http_headers.get('content-length', '1') == '0':
                        print >>warn, "W: empty page [%d] %s" % (feednum, f.url)
                    elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
                        print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)
                    elif exc_type == IOError:
                        print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
                    elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
                        print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)
                    elif exc_type in socket_errors:
                        exc_reason = r.bozo_exception.args[1]
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    elif exc_type == urllib2.URLError:
                        if r.bozo_exception.reason.__class__ in socket_errors:
                            exc_reason = r.bozo_exception.reason.args[1]
                        else:
                            exc_reason = r.bozo_exception.reason
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    elif exc_type == AttributeError:
                        print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
                    elif exc_type == KeyboardInterrupt:
                        raise r.bozo_exception
                    elif r.bozo:
                        print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))
                    else:
                        print >>warn, "=== rss2email encountered a problem with this feed ==="
                        print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                        print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                        print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
                        print >>warn, r
                        print >>warn, "rss2email", __version__
                        print >>warn, "feedparser", feedparser.__version__
                        print >>warn, "html2text", h2t.__version__
                        print >>warn, "Python", sys.version
                        print >>warn, "=== END HERE ==="
                    continue

                r.entries.reverse()

                for entry in r.entries:
                    id = getID(entry)

                    # If TRUST_GUID isn't set, we get back hashes of the content.
                    # Instead of letting these run wild, we put them in context
                    # by associating them with the actual ID (if it exists).
                    frameid = entry.get('id')
                    if not frameid:
                        frameid = id
                    if type(frameid) is DictType:
                        frameid = frameid.values()[0]

                    # If this item's ID is in our database
                    # then it's already been sent
                    # and we don't need to do anything more.
                    if frameid in f.seen:
                        if f.seen[frameid] == id:
                            continue

                    if not (f.to or default_to):
                        print "No default email address defined. Please run 'r2e email emailaddress'"
                        print "Ignoring feed %s" % f.url
                        break

                    if 'title_detail' in entry and entry.title_detail:
                        title = entry.title_detail.value
                        if contains(entry.title_detail.type, 'html'):
                            title = html2text(title)
                    else:
                        title = getContent(entry)[:70]

                    title = title.replace("\n", " ").strip()

                    when = time.gmtime()
                    if DATE_HEADER:
                        for datetype in DATE_HEADER_ORDER:
                            kind = datetype + "_parsed"
                            if kind in entry and entry[kind]:
                                when = entry[kind]

                    link = entry.get('link', "")

                    from_addr = getEmail(r, entry)
                    name = h2t.unescape(getName(r, entry))
                    fromhdr = formataddr((name, from_addr,))
                    tohdr = (f.to or default_to)
                    subjecthdr = title
                    datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", when)
                    useragenthdr = "rss2email"

                    # Add post tags, if available
                    tagline = ""
                    if 'tags' in entry:
                        tags = entry.get('tags')
                        taglist = []
                        if tags:
                            for tag in tags:
                                taglist.append(tag['term'])
                        if taglist:
                            tagline = ",".join(taglist)

                    extraheaders = {'Date': datehdr,
                                    'User-Agent': useragenthdr,
                                    'X-RSS-Feed': f.url,
                                    'Message-ID': '<%s>' % hashlib.sha1(id.encode('utf-8')).hexdigest(),
                                    'X-RSS-ID': id,
                                    'X-RSS-URL': link,
                                    'X-RSS-TAGS': tagline,
                                    'X-MUNGED-FROM': getMungedFrom(r),
                                    'References': ''}
                    if BONUS_HEADER != '':
                        for hdr in BONUS_HEADER.strip().splitlines():
                            pos = hdr.strip().find(':')
                            if pos > 0:
                                extraheaders[hdr[:pos]] = hdr[pos + 1:].strip()
                            else:
                                print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER

                    entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
                    contenttype = 'plain'
                    content = ''

                    if THREAD_ON_TAGS and len(tagline):
                        extraheaders['References'] += ''.join([' <%s>' % hashlib.sha1(t.strip().encode('utf-8')).hexdigest() for t in tagline.split(',')])

                    if USE_CSS_STYLING and HTML_MAIL:
                        contenttype = 'html'
                        content = "<html>\n"
                        content += '<head><meta http-equiv="Content-Type" content="text/html"><style>' + STYLE_SHEET + '</style></head>\n'
                        content += '<body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">\n'
                        content += '<div id="entry">\n'
                        content += '<h1 class="header"'
                        content += '><a href="' + link + '">' + subjecthdr + '</a></h1>\n'
                        if ishtml(entrycontent):
                            body = entrycontent[1].strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (summarize(html2text(body, plaintext=True), SUMMARIZE) + "<hr>")
                        else:
                            body = entrycontent.strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (summarize(body, SUMMARIZE) + "<hr>")
                        if THREAD_ON_LINKS:
                            parser = Parser()
                            parser.feed(body)
                            extraheaders['References'] += ''.join([' <%s>' % hashlib.sha1(h.strip().encode('utf-8')).hexdigest() for h in parser.attrs])
                        if INLINE_IMAGES_DATA_URI:
                            parser = Parser(tag='img', attr='src')
                            parser.feed(body)
                            for src in parser.attrs:
                                try:
                                    img = feedparser._open_resource(src, None, None, feedparser.USER_AGENT, link, [], {})
                                    data = img.read()
                                    if hasattr(img, 'headers'):
                                        headers = dict((k.lower(), v) for k, v in dict(img.headers).items())
                                        ctype = headers.get('content-type', None)
                                        if ctype and INLINE_IMAGES_DATA_URI:
                                            body = body.replace(src, 'data:%s;base64,%s' % (ctype, base64.b64encode(data)))
                                except:
                                    print >>warn, "Could not load image: %s" % src
                                    pass
                        if body != '':
                            content += '<div id="body">\n' + body + '</div>\n'
                        content += '\n<p class="footer">URL: <a href="' + link + '">' + link + '</a>'
                        if hasattr(entry, 'enclosures'):
                            for enclosure in entry.enclosures:
                                if (hasattr(enclosure, 'url') and enclosure.url != ""):
                                    content += ('<br/>Enclosure: <a href="' + enclosure.url + '">' + enclosure.url + "</a>\n")
                                if (hasattr(enclosure, 'src') and enclosure.src != ""):
                                    content += ('<br/>Enclosure: <a href="' + enclosure.src + '">' + enclosure.src + '</a><br/><img src="' + enclosure.src + '"\n')
                        if 'links' in entry:
                            for extralink in entry.links:
                                if ('rel' in extralink) and extralink['rel'] == u'via':
                                    extraurl = extralink['href']
                                    extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
                                    viatitle = extraurl
                                    if ('title' in extralink):
                                        viatitle = extralink['title']
                                    content += '<br/>Via: <a href="' + extraurl + '">' + viatitle + '</a>\n'
                        content += '</p></div>\n'
                        content += "\n\n</body></html>"
                    else:
                        if ishtml(entrycontent):
                            contenttype = 'html'
                            content = "<html>\n"
                            content = ("<html><body>\n\n"
                                       + '<h1><a href="' + link + '">' + subjecthdr + '</a></h1>\n\n'
                                       + entrycontent[1].strip()  # drop type tag (HACK: bad abstraction)
                                       + '<p>URL: <a href="' + link + '">' + link + '</a></p>')
                            if hasattr(entry, 'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('Enclosure: <a href="' + enclosure.url + '">' + enclosure.url + "</a><br/>\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += 'Via: <a href="' + extralink['href'] + '">' + extralink['title'] + '</a><br/>\n'
                            content += ("\n</body></html>")
                        else:
                            content = entrycontent.strip() + "\n\nURL: " + link
                            if hasattr(entry, 'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('\nEnclosure: ' + enclosure.url + "\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += '<a href="' + extralink['href'] + '">Via: ' + extralink['title'] + '</a>\n'

                    mailserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, when, extraheaders, mailserver, f.folder)

                    f.seen[frameid] = id

                f.etag, f.modified = r.get('etag', None), r.get('modified', None)
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                print >>warn, "=== rss2email encountered a problem with this feed ==="
                print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                print >>warn, "E: could not parse", f.url
                traceback.print_exc(file=warn)
                print >>warn, "rss2email", __version__
                print >>warn, "feedparser", feedparser.__version__
                print >>warn, "html2text", h2t.__version__
                print >>warn, "Python", sys.version
                print >>warn, "=== END HERE ==="
                continue
    finally:
        unlock(feeds, feedfileObject)
        if mailserver:
            if IMAP_MARK_AS_READ:
                for folder in IMAP_MARK_AS_READ:
                    mailserver.select(folder)
                    res, data = mailserver.search(None, '(UNSEEN UNFLAGGED)')
                    if res == 'OK':
                        items = data[0].split()
                        for i in items:
                            res, data = mailserver.fetch(i, "(UID)")
                            if data[0]:
                                u = uid(data[0])
                                res, data = mailserver.uid('STORE', u, '+FLAGS', '(\Seen)')
            if IMAP_MOVE_READ_TO:
                typ, data = mailserver.list(pattern='*')
                # Parse folder listing as a CSV dialect (automatically removes quotes)
                reader = csv.reader(StringIO.StringIO('\n'.join(data)), dialect='mailboxlist')
                # Iterate over each folder
                for row in reader:
                    folder = row[-1:][0]
                    if folder == IMAP_MOVE_READ_TO or '\Noselect' in row[0]:
                        continue
                    mailserver.select(folder)
                    yesterday = (datetime.now() - timedelta(days=1)).strftime("%d-%b-%Y")
                    res, data = mailserver.search(None, '(SEEN BEFORE %s UNFLAGGED)' % yesterday)
                    if res == 'OK':
                        items = data[0].split()
                        for i in items:
                            res, data = mailserver.fetch(i, "(UID)")
                            if data[0]:
                                u = uid(data[0])
                                res, data = mailserver.uid('COPY', u, IMAP_MOVE_READ_TO)
                                if res == 'OK':
                                    res, data = mailserver.uid('STORE', u, '+FLAGS', '(\Deleted)')
                                    mailserver.expunge()
            try:
                mailserver.quit()
            except:
                mailserver.logout()
def summarize_text():
    to_summarize = str(haven.get_text(request.form['url']))
    return summarize.summarize(to_summarize)
def get_concepts():
    url = request.form['url']
    data = json.dumps(haven.analysis(request.form['url'], False))
    summary = str(haven.get_text(request.form['url']))
    return render_template('learn.html', data=data, url=url,
                           summary=summarize.summarize(summary))
def test_summarize_custom():
    assert summarize.summarize([1, 1, 1, 1, 1]) == 1
def process_request():
    if request.method == 'POST':
        url = request.data.get("url_field")
        # print(url)
        key = request.data.get("keywords")
        key = key[1:-1].split(",")
        keywords = []
        for k in key:
            keywords.append(k[1:-1])
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")
        categories = ['new act', 'new rule', 'new regulation', 'notification',
                      'circular', 'press release', 'scheme', 'order',
                      'ordinance', 'amendment', 'resolution', 'bill', 'report',
                      'guideline', 'direction', 'clarification',
                      'master direction', 'revised']
        cat_map = dict()
        for k in keywords:
            india_links = lambda tag: (getattr(tag, 'name', None) == 'a'
                                       and 'href' in tag.attrs
                                       and k in tag.get_text().lower())
            results = soup.find_all(india_links)
            extracted = []
            for i in results:
                p = i.get('href')
                i.find('title')
                l = []
                l.append(i.contents[0])
                l.append(p)
                extracted.append(l)
            for z in extracted:
                flag = 0
                for cat in categories:
                    if cat in z[0].lower():
                        if cat in cat_map:
                            cat_map[cat].append(z)
                            flag = 1
                            break
                        else:
                            cat_map[cat] = [z]
                            flag = 1
                            break
                if flag == 0:
                    if 'others' in cat_map:
                        cat_map['others'].append(z)
                        flag = 1
                    else:
                        cat_map['others'] = [z]
        absUrl = 'http://www.sebi.gov.in/'
        for k, v in cat_map.items():
            for q in range(len(v)):
                url = v[q][1]
                r = requests.get(url)
                soup = BeautifulSoup(r.content, "html.parser")
                for i in soup.find_all('iframe'):
                    innerLinks = i.get('src')
                    pdfLink = absUrl + innerLinks[28:]
                    pdfLink = str(pdfLink)
                    print(pdfLink)
                    # url = 'http://www.sebi.gov.in/web/?file=../../../sebi_data/attachdocs/nov-2017/1509707086156.pdf'
                    url = pdfLink
                    # writer = PdfFileWriter()
                    pdf = pdfx.PDFx(url)
                    # metadata = pdf.get_metadata()
                    references_dict = pdf.get_references_as_dict()
                    metadata = pdf.get_metadata()
                    text = pdf.get_text()
                    z = summarize(text, sentence_count=4, language='english')
                    v[q].append(z)
                    v[q].append(references_dict)
                    v[q].append(metadata)
        return cat_map, status.HTTP_200_OK
def run_model(destination, subsample=None, min_Nflights=None):
    ### Unpickle datasets
    print 'Unpickling datasets...'
    time0 = time.time()
    filename = '/data/DelayMeNot/data/pickles_by_destination/datasets_%s.pkl' % destination
    (data_train, data_test) = cd.load_from_pickle(filename, gzip=True)
    print ' that took %.1f seconds.' % (time.time() - time0)

    ### subsample training data
    if subsample:
        if len(data_train) > 1e6:
            Nsub = int(float(len(data_train)) / 1e6)
            data_train = data_train.ix[::Nsub]

    ### remove all routes with less than min_Nflights flights
    grouped = data_train.groupby('Origin')
    Nflights = grouped['Origin'].count()
    Nflights.sort(ascending=True)
    if min_Nflights:
        orig_list = list(Nflights[Nflights > min_Nflights].index)
        if len(orig_list) == 0:
            print 'Found no routes with more than %d flights!' % min_Nflights
            return 0
        data_train = data_train[data_train['Origin'].isin(orig_list)]
        data_test = data_test[data_test['Origin'].isin(orig_list)]
    else:
        orig_list = list(Nflights.index)
    ## return data_train, data_test

    ### "Dummify" the categorical 'Carrier' and 'Origin' columns,
    ### and add the dummies to the table, but drop the first dummy
    ### column to avoid the "dummy variable trap".
    print 'Dummifying datasets...'
    time0 = time.time()
    dummies = pd.get_dummies(data_train['Carrier'], prefix='Carrier')
    data_train = data_train.join(dummies.ix[:, 1:])
    dummies = pd.get_dummies(data_test['Carrier'], prefix='Carrier')
    data_test = data_test.join(dummies.ix[:, 1:])
    dummies = pd.get_dummies(data_train['Origin'], prefix='Origin')
    data_train = data_train.join(dummies.ix[:, 1:])
    dummies = pd.get_dummies(data_test['Origin'], prefix='Origin')
    data_test = data_test.join(dummies.ix[:, 1:])

    ### Drop dummified columns
    data_train = data_train.drop(['Carrier', 'Origin'], axis=1)
    data_test = data_test.drop(['Carrier', 'Origin'], axis=1)
    print ' that took %.1f seconds.' % (time.time() - time0)

    ### Training set columns
    train_cols = list(data_train.columns)
    train_cols.remove('ArrivalDelay')

    ### Add any missing training columns to test dataset
    test_cols = list(data_test.columns)
    for tc in train_cols:
        if tc not in test_cols:
            data_test[tc] = np.zeros_like(data_test[test_cols[0]])

    ### Define training and test data set variables
    late_delay = 30.0
    X = data_train[train_cols].values.copy()
    Y = np.zeros_like(data_train['ArrivalDelay'].values)
    Y[data_train['ArrivalDelay'].values > late_delay] = 1
    Xtest = data_test[train_cols].values.copy()
    Ytest = np.zeros_like(data_test['ArrivalDelay'].values)
    Ytest[data_test['ArrivalDelay'].values > late_delay] = 1
    del data_train, data_test

    ### Train the RandomForest model
    ## n_estimators = 1000
    n_estimators = 128
    ## max_features = 'auto'
    max_features = int(X.shape[1] / 2)
    print 'Constructing random forest classifier from training set...'
    print ' Number of flights in training data set = %d' % len(Y)
    sys.stdout.flush()
    time0 = time.time()
    rfor = ensemble.RandomForestClassifier(n_estimators=n_estimators,
                                           max_features=max_features, n_jobs=8)
    rfor = rfor.fit(X, Y)
    rfor.n_jobs = 1
    Y_pred = rfor.predict(X)
    train_summary = summarize(Y, Y_pred)
    dt_train = time.time() - time0
    print ' that took %.1f seconds.\n' % dt_train
    sys.stdout.flush()

    ### Test the model
    print 'Testing the model...'
    time0 = time.time()
    Ytest_pred = rfor.predict(Xtest)
    test_summary = summarize(Ytest, Ytest_pred)
    dt_test = (time.time() - time0)
    print ' that took %.1f seconds.' % dt_test
    sys.stdout.flush()

    ### Construct model summary dict
    model_summary = {}
    model_summary['training_columns'] = train_cols
    model_summary['training'] = train_summary
    model_summary['time_to_train'] = dt_train
    model_summary['test'] = test_summary
    model_summary['time_to_test'] = dt_test
    model_summary['late_delay'] = late_delay
    if subsample:
        model_summary['subsample'] = True
        model_summary['Nsub'] = Nsub
    else:
        model_summary['subsample'] = False
    model_summary['min_Nflights'] = min_Nflights
    model_summary['n_estimators'] = n_estimators
    model_summary['max_features'] = max_features

    ### Pickle the result
    print 'Pickling the result...'
    time0 = time.time()
    filename = '../RandomForest_models/by_destination/rfm_%s.pkl' % destination
    f = open(filename, 'wb')
    cPickle.dump((rfor, model_summary), f, 2)
    f.close()
    subprocess.call('gzip %s' % filename, shell=True)
    print ' that took %.1f seconds.' % (time.time() - time0)
def queryToDocument():
    """
    Uses the inputted query and returns all the documents relating to the
    query. It then asks the user whether they want a summary of a document.

    Return: All documents relating to the query, and a question asking
    whether the user wants a summary
    """
    finalDocs = db.engine.query(Query, 3)
    finalDocs = [i[0] for i in finalDocs]
    print(finalDocs, flush=True)
    wikipediaString = "wikipedia"
    cnnString = "cnn"
    reuterString = "reuters"
    global TotalDocs
    if len(finalDocs) == 1:
        if wikipediaString in finalDocs[0]:
            source = wikipediaString
        elif cnnString in finalDocs[0]:
            source = cnnString
        elif reuterString in finalDocs[0]:
            source = reuterString
        summary = summarize(db, finalDocs[0], stopwords)
        title = getTitle()
        image_msg = "The top document is {}".format(title) + " from {}".format(source) + "."
        image_msg += " Would you like a summary of this document?"
        TotalDocs = finalDocs
        return question(image_msg)
    elif len(finalDocs) == 2:
        if wikipediaString in finalDocs[0]:
            source1 = wikipediaString
        if cnnString in finalDocs[0]:
            source1 = cnnString
        if reuterString in finalDocs[0]:
            source1 = reuterString
        if wikipediaString in finalDocs[1]:
            source2 = wikipediaString
        if cnnString in finalDocs[1]:
            source2 = cnnString
        if reuterString in finalDocs[1]:
            source2 = reuterString
        filler = summarize(db, finalDocs[0], stopwords)
        title1 = getTitle()
        filler2 = summarize(db, finalDocs[1], stopwords)[0]
        title2 = getTitle()
        image_msg = ("The top documents are {}".format(finalDocs[0])
                     + " from {}".format(source1) + " and "
                     + "{}".format(finalDocs[1]) + " from {}".format(source2))
        image_msg += " Would you like a summary of a document?"
        TotalDocs = finalDocs
        return statement(image_msg)
    image_msg = "The top documents are "
    for i in range(len(finalDocs) - 1):
        if wikipediaString in finalDocs[i]:
            source = wikipediaString
        elif cnnString in finalDocs[i]:
            source = cnnString
        elif reuterString in finalDocs[i]:
            source = reuterString
        # filler = summarize(db, finalDocs[i], stopwords, summarizeLength=5)
        # title = getTitle()
        image_msg += "{}".format(finalDocs[i]) + " from {}".format(source)
        image_msg += ", "
    image_msg += "and "
    fillerrr = summarize(db, finalDocs[-1], stopwords)
    titleLast = getTitle()
    image_msg += "{}".format(titleLast)
    image_msg += " Would you like a summary of a document?"
    TotalDocs = finalDocs
    return statement(image_msg)
def test_summarize_seed():
    np.random.seed(5)
    numbers = summarize.gen_numbers(5)
    assert summarize.summarize(numbers) == np.mean([99, 78, 61, 16, 73])
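# The summarize module under test is not shown. A minimal sketch that would
# satisfy both tests above (the implementation details are assumptions, not
# confirmed by the source): gen_numbers draws n random integers via NumPy's
# global RNG, and summarize returns the mean of a list of numbers.
import numpy as np

def gen_numbers(n):
    """Draw n random integers in [0, 100) using NumPy's global RNG."""
    return list(np.random.randint(0, 100, n))

def summarize(numbers):
    """Return the arithmetic mean of a list of numbers."""
    return np.mean(numbers)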
def createsummary(options, totalprocs, procid):
    procidstr = "%s of %s " % (procid, totalprocs) if totalprocs != None else ""
    logging.info("Processor " + procidstr + "starting")

    referencetime = int(time.time()) - (7 * 24 * 3600)

    config = account.getconfig(options['config'])
    dbconf = config['accountdatabase']

    outdb = output.factory(config['outputdatabase'])

    ratecalc = RateCalculator(procid)
    timewindows = dict()

    for resourcename, settings in config['resources'].iteritems():
        if 'enabled' in settings:
            if settings['enabled'] == False:
                continue
        if options['resource'] not in (None, resourcename, str(settings['resource_id'])):
            continue

        processtimes = {"mintime": 2**64, "maxtime": 0}

        dbreader = account.DbAcct(settings['resource_id'], dbconf,
                                  PROCESS_VERSION, totalprocs, procid,
                                  options['localjobid'])

        bacct = batch_acct.factory(settings['batch_system'],
                                   settings['acct_path'],
                                   settings['host_name_ext'])

        if settings['lariat_path'] != "":
            lariat = summarize.LariatManager(settings['lariat_path'])
        else:
            lariat = None

        dbwriter = account.DbLogger(dbconf["dbname"], dbconf["tablename"],
                                    dbconf["defaultsfile"])

        for acct in dbreader.reader():
            logging.debug("%s local_job_id = %s", resourcename, acct['id'])
            job = job_stats.from_acct(acct, settings['tacc_stats_home'],
                                      settings['host_list_dir'], bacct)
            summary, timeseries = summarize.summarize(job, lariat)

            insertOk = outdb.insert(resourcename, summary, timeseries)

            if summary['complete'] == False and summary["acct"]['end_time'] > referencetime:
                # Do not mark incomplete jobs as done unless they are older
                # than the reference time (which defaults to 7 days ago).
                dbwriter.logprocessed(acct, settings['resource_id'], ERROR_INCOMPLETE)
                continue

            if insertOk:
                dbwriter.logprocessed(acct, settings['resource_id'], PROCESS_VERSION)
                processtimes['mintime'] = min(processtimes['mintime'], summary["acct"]['end_time'])
                processtimes['maxtime'] = max(processtimes['maxtime'], summary["acct"]['end_time'])
                ratecalc.increment()
            else:
                # Mark with a negative process version to indicate that it has
                # been processed but no summary was output.
                dbwriter.logprocessed(acct, settings['resource_id'], 0 - PROCESS_VERSION)

        if processtimes['maxtime'] != 0:
            timewindows[resourcename] = processtimes

    logging.info("Processor " + procidstr + "exiting. Processed %s", ratecalc.count)

    if ratecalc.count == 0:
        # No need to generate a report if no docs were processed.
        return

    proc = {"host": socket.getfqdn(),
            "instance": procid,
            "totalinstances": totalprocs,
            "start_time": ratecalc.starttime,
            "end_time": time.time(),
            "rate": ratecalc.rate,
            "records": ratecalc.count}

    report = {"proc": proc, "resources": timewindows}

    outdb.logreport(report)
def main():
    # Import the self-created "volumes" module and the given "summarize" module.
    import volumes
    import summarize

    # All valid user inputs when asking for a shape.
    validInput = ["cube", "c", "pyramid", "p", "ellipsoid", "e", "quit", "q"]
    valid = False   # Used to keep while loops running until input is valid
    isShape = True  # Used to see if user input is a shape or quit
    index = 0       # Tracks the index of user input within the "validInput" list

    # Lists of all the shapes to keep track of calculated volumes.
    cubeVolumes = []
    pyramidVolumes = []
    ellipsoidVolumes = []

    # Introduction.
    print("~Welcome to the Volume Calculator.~")
    print("")
    print("")

    # While loop to ask for a valid test case number.
    while not valid:
        # Ask for the test case number.
        testCase = input("Enter the test case number: ")
        # If the input is an integer, set the testCase variable to the input
        # and change 'valid' to True to exit the while loop.
        if testCase.isnumeric():
            testCase = int(testCase)
            valid = True
        else:
            # If the input is not an integer, print a message and loop again.
            print("Sorry, the test case must be a number.")

    valid = False  # Reset 'valid' for the next while loop

    # While loop that keeps running until the user enters 'quit' or 'q'.
    while isShape:
        # While loop that asks for user input, calculates the volume (if
        # necessary), and keeps running until a quit input.
        while not valid:
            shape = str(input("Please enter a shape: "))  # Ask for input
            shape = shape.lower()  # Convert to lower case
            # If the input is within the list of valid inputs, track down its
            # index within the list and set 'valid' to True. This allows the
            # program to exit the while loop.
            if shape in validInput:
                index = validInput.index(shape)
                valid = True
            else:
                # Print an error message for invalid input, and loop again.
                print("Sorry, your input is invalid.")
        print("")

        # If the user input was "cube" or "c", perform the necessary actions.
        if index in range(0, 2):
            # Ask the user for the side length.
            sideLength = int(input("Enter the side length of the cube: "))
            # Calculate the volume by sending the side length to the
            # "cubeVolume" method within the 'volumes' module.
            currentVolume = volumes.cubeVolume(sideLength)
            # Add the volume to the cubes list.
            cubeVolumes.append(currentVolume)
        # If the user input was "pyramid" or "p", perform the necessary actions.
        elif index in range(2, 4):
            # Ask the user for the base length and the height.
            baseLength = int(input("Enter the base length of the pyramid: "))
            height = int(input("Enter the height of the pyramid: "))
            # Calculate the volume by sending both base and height to the
            # "pyramidVolume" method within the 'volumes' module.
            currentVolume = volumes.pyramidVolume(baseLength, height)
            # Add the volume to the pyramids list.
            pyramidVolumes.append(currentVolume)
        # If the user input was "ellipsoid" or "e", perform the necessary actions.
        elif index in range(4, 6):
            # Ask for the three radii of the ellipsoid.
            radius1 = int(input("Enter the first radius: "))
            radius2 = int(input("Enter the second radius: "))
            radius3 = int(input("Enter the third radius: "))
            # Calculate the volume by sending the three radii to the
            # 'ellipsoidVolume' method within the 'volumes' module.
            currentVolume = volumes.ellipsoidVolume(radius1, radius2, radius3)
            # Add the volume to the ellipsoids list.
            ellipsoidVolumes.append(currentVolume)
        # If the user input was "quit" or "q", change 'isShape' to False to
        # allow the program to exit the loop.
        elif index in range(6, 8):
            isShape = False

        # Reset 'valid' to False, allowing the program to loop through asking
        # the user for shape input again if necessary.
        valid = False

    # Sort the volumes within each shape volume list in ascending order.
    cubeVolumes.sort()
    pyramidVolumes.sort()
    ellipsoidVolumes.sort()

    # Notify that the session has finished.
    print("")
    print("")
    print("You have reached the end of your session.")

    # If the user has not performed any calculations, print an appropriate message.
    if len(cubeVolumes) == 0 and len(pyramidVolumes) == 0 and len(ellipsoidVolumes) == 0:
        print("You did not perform any volume calculations.")
    else:
        print("The volumes calculated for each shape are:")
        # If there are calculated cube volumes, print them.
        if len(cubeVolumes) != 0:
            print("Cube: ", cubeVolumes)
        else:
            # If there are no cube volumes, print an appropriate message.
            print("Cube: No shapes entered")
        # If there are calculated pyramid volumes, print them.
        if len(pyramidVolumes) != 0:
            print("Pyramid: ", pyramidVolumes)
        else:
            # If there are no pyramid volumes, print an appropriate message.
            print("Pyramid: No shapes entered")
        # If there are calculated ellipsoid volumes, print them.
        if len(ellipsoidVolumes) != 0:
            print("Ellipsoid: ", ellipsoidVolumes)
        else:
            # If there are no ellipsoid volumes, print an appropriate message.
            print("Ellipsoid: No shapes entered")

    # Within the 'summarize' module, send all the lists of volumes and the
    # test case number to the "summarize" method. This will print them to a
    # text file with the appropriate test case number.
    summarize.summarize(cubeVolumes, pyramidVolumes, ellipsoidVolumes, testCase)
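# The given "summarize" module is not shown. A minimal sketch of what its
# summarize() method might look like (the file name pattern and output layout
# are assumptions, not confirmed by the source): write each shape's sorted
# volumes to a text file named after the test case number.
def summarize(cubeVolumes, pyramidVolumes, ellipsoidVolumes, testCase):
    """Write the calculated volumes for each shape to testcase<N>.txt."""
    with open("testcase{}.txt".format(testCase), "w") as f:
        f.write("Cube: {}\n".format(cubeVolumes))
        f.write("Pyramid: {}\n".format(pyramidVolumes))
        f.write("Ellipsoid: {}\n".format(ellipsoidVolumes))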