def main():
    parser = argparse.ArgumentParser(description="Retrieve petitions from We The People")
    parser.add_argument("-m", "--max", metavar="INTEGER", dest="max", type=int, default=None,
                        help="maximum pages of petitions to retrieve, default is 10, 100 per page")
    parser.add_argument("-s", "--start", metavar="INTEGER", dest="start", type=int, default=1,
                        help="starting page, 100 per page, default is 1")
    parser.add_argument("-q", "--query", metavar="STRING", dest="query", type=str, default="whitehouse+petition",
                        help="The query for searching twitter for petition links, default is 'whitehouse+petition'")
    args = parser.parse_args()

    if args.max is not None and args.max < 1:
        parser.error("How can I scrape less than one page of twitter results? You make no sense! --max must be one or greater.")
    if args.start < 1:
        parser.error("--start must be one or greater.")
    if not len(sys.argv) > 1:
        log('Running with default values. Use --help to see options.')

    search(args.query, args.start, args.max)

    # write log
    scrapelog["query"] = args.query
    scrapelog["end"] = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    write(json.dumps(scrapelog, indent=2), "log-tw-" + scrapelog["begin"] + ".json", log_dir())
    log("Done. Found total %i petitions" % (len(scrapelog["signatures"])))
def process(inFile, outFile, targets, algo):
    capture = cv2.VideoCapture(inFile)
    retval, image = capture.read()
    locations = []
    if retval:
        writer = cv2.VideoWriter(outFile + ".avi", fps=25,
                                 fourcc=cv2.cv.CV_FOURCC(*"DIVX"),
                                 frameSize=image.shape[0:2][::-1])
        algorithms = []
        for x in targets:
            algo.start(image, x)
            algorithms.append(algo)
            utils.drawTarget(image, algo.target)
        writer.write(image)
        w, h = image.shape[:2]
        while retval:
            retval, image = capture.read()
            target = np.array(algo.target) / np.array([h, w, h, w], dtype=np.float32)
            locations.append(target)
            if retval:
                for algo in algorithms:
                    algo.next(image)
                    color = (255, 0, 0)
                    if algo.valid:
                        color = (0, 255, 0)
                    utils.drawTarget(image, algo.target, color)
                writer.write(image)
    utils.write(outFile + ".txt", inFile, locations)
def do(self):
    log_msg = 'Tagging: "%s" as "%s"' % (self._revision, self._name)
    opts = {}
    if self._message:
        opts['F'] = utils.tmp_filename('tag-message')
        utils.write(opts['F'], self._message)
    if self._sign:
        log_msg += ', GPG-signed'
        opts['s'] = True
        status, output = self.model.git.tag(self._name, self._revision,
                                            with_status=True, with_stderr=True, **opts)
    else:
        opts['a'] = bool(self._message)
        status, output = self.model.git.tag(self._name, self._revision,
                                            with_status=True, with_stderr=True, **opts)
    if 'F' in opts:
        os.unlink(opts['F'])
    if output:
        log_msg += '\nOutput:\n%s' % output
    _notifier.broadcast(signals.log_cmd, status, log_msg)
    if status == 0:
        self.model.update_status()
def panel(self):
    """ Set up the side panel """
    self.disp.blit(IMG_SIDEPANEL_BG, (16*SQUARE, 16, 450, 496))
    self.btn_traps = []
    x = 16*SQUARE + 65
    y = 20
    for i in TRAPS:
        self.disp.blit(i[0], (x, y))
        name = utils.write(i[1], BLACK)
        price = utils.write(str(i[3]), GRAY)
        lines = utils.formattext(i[2], 35, BLACK, 15)
        self.disp.blit(name, (x+40, y+2))
        self.disp.blit(price, (x+275, y+2))
        i = 20
        for l in lines:
            self.disp.blit(l, (x+40, y+i))
            i += 15
        self.btn_traps.append((x, y, 330, i+5))
        y += 75
    self.disp.blit(IMG_LEVEL, RECT_LEVEL)
    self.disp.blit(IMG_MONEY, RECT_MONEY)
    self.disp.blit(IMG_LAB_QUIT, BTN_LAB_QUIT)
    self.disp.blit(IMG_LAB_START, BTN_LAB_START)
    self.updatepanel()
def main():
    parser = argparse.ArgumentParser(description="Retrieve petitions from We The People")
    parser.add_argument(
        "-m", "--max",
        metavar="INTEGER",
        dest="max",
        type=int,
        default=None,
        help="maximum number of petitions to retrieve",
    )
    parser.add_argument(
        "-s", "--start",
        metavar="INTEGER",
        dest="start",
        type=int,
        default=1,
        help="starting page, 20 per page, default is 1",
    )
    args = parser.parse_args()

    if args.max is not None and args.max < 1:
        parser.error("How can I scrape less than one petition? You make no sense! --max must be one or greater.")
    if args.start < 1:
        parser.error("--start must be one or greater.")

    log("Found %i petitions" % (petitions(args.start, args.max)))

    # write log
    scrapelog["end"] = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    write(json.dumps(scrapelog, indent=2), "log-wh-" + scrapelog["begin"] + ".json", log_dir())
def split_signatures(pid, signatures=None):
    if not signatures:
        signatures = json.load(open(os.getcwd() + "/data/api/signatures/" + pid + ".json", "r"))
    for signature in signatures:
        signature['date'] = datetime.datetime.fromtimestamp(signature['created']).strftime("%y-%m-%d")
        signature['time'] = datetime.datetime.fromtimestamp(signature['created']).strftime("%H:%M:%S")
        # rm this needless field
        if signature['type'] == "signature":
            signature.pop("type")
    dates = sorted(set(map(lambda x: x['date'], signatures)))
    mostrecent = max([x['created'] for x in signatures])
    stats = {
        'total': len(signatures),
        'dates': [],
        'last': datetime.datetime.fromtimestamp(mostrecent).strftime("%y-%m-%d"),
        'laststamp': mostrecent
    }
    for day in dates:
        sigs = [x for x in signatures if x['date'] == day]
        stats['dates'].append((day, len(sigs)))
        write(json.dumps(sigs), "api/signatures/" + pid + "/" + day + ".json")
    write(json.dumps(stats, indent=2), "api/signatures/" + pid + "/stats.json")
def combine():
    roster = defaultdict(list)
    total = [defaultdict(int) for x in range(segments)]
    starts = {}
    data = json.load(open("data/times/all.json", "r"))
    duds = 0
    co = 0
    for runner in data:
        #print runner["bib number"], runner["5K"]
        #see if he/she showed up
        if "5K" not in runner or not runner["5K"][1]:
            duds += 1
            continue
        co += 1
        if co % 100 == 0:
            print co
        #placement will represent which marker he/she was closest to at each interval
        placement = ["0" for x in range(segments)]
        #stamps is the timestamps scraped from BAA.org
        stamps = [runner[x][1] for x in posts]
        marker = 0
        #fill in placement with most recent split time (intervals of 5K + half and finish)
        for c in range(segments):
            if c > 0:
                placement[c] = placement[c - 1]
            if marker < len(posts) and stamps[marker] and stamps[marker] < c * INTERVAL:
                placement[c] = posts[marker]
                marker += 1
        placement = [int(x.replace("K", "").replace("Finish Net", "42").replace("HALF", "21")) for x in placement]
        #print placement
        #print runner["bib number"]
        #calculate interpolations between kilometer marks
        #start at appropriate place for offset in starting point
        c = int(round(runner["0K"] / INTERVAL))
        while c < len(placement):
            if placement[c] == placement[-1] or c >= len(placement) - 2:
                break
            t = 1
            while c + t < len(placement) and placement[c + t] == placement[c]:
                t += 1
            #print c, t, placement[c+t], placement[c]
            step = float(placement[c + t] - placement[c]) / t
            for i in range(1, t):
                placement[c + i] = int(math.floor(placement[c + i] + i * step))
            c += t
        #print placement
        key = "_".join([str(x) for x in placement])
        roster[key].append(runner["bib number"])
        for c in range(segments):
            total[c][placement[c]] += 1
    write(json.dumps(roster, indent=2), "times/condensed.json")
    write(json.dumps(total, indent=2), "times/condensed_time.json")
def update_bill_version_list(only_congress):
    bill_versions = {}

    # Which sitemap years should we look at?
    if not only_congress:
        sitemap_files = glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/BILLS.xml")
    else:
        # If --congress=X is specified, only look at the relevant years.
        sitemap_files = [
            utils.cache_dir() + "/fdsys/sitemap/" + str(year) + "/BILLS.xml"
            for year in utils.get_congress_years(only_congress)
        ]
        sitemap_files = [f for f in sitemap_files if os.path.exists(f)]

    # For each year-by-year BILLS sitemap...
    for year_sitemap in sitemap_files:
        dom = etree.parse(year_sitemap).getroot()
        if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
            raise Exception("Mismatched sitemap type.")

        # Loop through each bill text version...
        for file_node in dom.xpath("x:url", namespaces=ns):
            # get URL and last modified date
            url = str(file_node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))

            # extract bill congress, type, number, and version from the URL
            m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
            if not m:
                raise Exception("Unmatched bill document URL: " + url)
            congress, bill_type, bill_number, version_code = m.groups()
            congress = int(congress)
            if bill_type not in utils.thomas_types:
                raise Exception("Invalid bill type: " + url)

            # If --congress=XXX is specified, only look at those bills.
            if only_congress and congress != only_congress:
                continue

            # Track the documents by congress, bill type, etc.
            bill_versions.setdefault(congress, {}).setdefault(bill_type, {}).setdefault(bill_number, {})[
                version_code
            ] = {"url": url, "lastmod": lastmod}

    # Output the bill version info. We can't do this until the end because we need to get
    # the complete list of versions for a bill before we write the file, and the versions
    # may be split across multiple sitemap files.
    for congress in bill_versions:
        for bill_type in bill_versions[congress]:
            for bill_number in bill_versions[congress][bill_type]:
                utils.write(
                    json.dumps(
                        bill_versions[congress][bill_type][bill_number],
                        sort_keys=True,
                        indent=2,
                        default=utils.format_datetime,
                    ),
                    output_for_bill(congress, bill_type, bill_number, "text-versions.json"),
                )
def fetch_votes(session, rootdir):
    #get list of all votes from session from GovTrack
    votes = parse("http://www.govtrack.us/data/us/%s/rolls/" % session)
    for vote in [x for x in votes.xpath("//a/@href") if x[-4:] == ".xml"]:
        chamber = "house" if vote[0] == 'h' else "senate"
        url = "http://www.govtrack.us/data/us/%s/rolls/%s" % (session, vote)
        doc = download(url, session + "/" + vote)
        # escape bare ampersands so the XML parses
        doc = doc.replace("&", "&amp;")
        try:
            markup = lxml.objectify.fromstring(doc)
        except Exception, e:
            print "Couldn't read", url
            print e
            continue
        data = {}
        data["rollcall"] = {}
        #walk through xml and collect key/value pairs
        for el in markup.getiterator():
            if el.attrib == {}:
                data[el.tag] = el.text
            elif el.tag == 'voter':
                data["rollcall"][el.attrib["id"]] = el.attrib["value"]
        print rootdir + "/data/json/%s/%s/%s.json" % (chamber, session, vote[:-4])
        write(json.dumps(data, indent=2), rootdir + "/data/json/%s/%s/%s.json" % (chamber, session, vote[:-4]))
def run(options):
    # Load the committee metadata from the congress-legislators repository and make a
    # mapping from thomas_id and house_id to the committee dict. For each committee,
    # replace the subcommittees list with a dict from thomas_id to the subcommittee.
    utils.require_congress_legislators_repo()
    committees = {}
    for c in utils.yaml_load("congress-legislators/committees-current.yaml"):
        committees[c["thomas_id"]] = c
        if "house_committee_id" in c:
            committees[c["house_committee_id"] + "00"] = c
        c["subcommittees"] = dict((s["thomas_id"], s) for s in c.get("subcommittees", []))

    for chamber in ("house", "senate"):
        # Load any existing meetings file so we can recycle GUIDs generated for Senate meetings.
        existing_meetings = []
        output_file = utils.data_dir() + "/committee_meetings_%s.json" % chamber
        if os.path.exists(output_file):
            existing_meetings = json.load(open(output_file))

        # Scrape for meeting info.
        if chamber == "senate":
            meetings = fetch_senate_committee_meetings(existing_meetings, committees, options)
        else:
            meetings = fetch_house_committee_meetings(existing_meetings, committees, options)

        # Write out.
        utils.write(json.dumps(meetings, sort_keys=True, indent=2, default=utils.format_datetime), output_file)
def process_bill(bill_id, options):
    fdsys_xml_path = _path_to_billstatus_file(bill_id)
    logging.info("[%s] Processing %s..." % (bill_id, fdsys_xml_path))

    # Read FDSys bulk data file.
    xml_as_dict = read_fdsys_bulk_bill_status_file(fdsys_xml_path, bill_id)
    bill_data = form_bill_json_dict(xml_as_dict)

    # Convert and write out data.json and data.xml.
    utils.write(
        unicode(json.dumps(bill_data, indent=2, sort_keys=True)),
        os.path.dirname(fdsys_xml_path) + '/data.json')

    from bill_info import create_govtrack_xml
    with open(os.path.dirname(fdsys_xml_path) + '/data.xml', 'wb') as xml_file:
        xml_file.write(create_govtrack_xml(bill_data, options))

    if options.get("amendments", True):
        process_amendments(bill_id, xml_as_dict, options)

    # Mark this bulk data file as processed by saving its lastmod
    # file under a new path.
    utils.write(
        utils.read(_path_to_billstatus_file(bill_id).replace(".xml", "-lastmod.txt")),
        os.path.join(os.path.dirname(fdsys_xml_path), "data-fromfdsys-lastmod.txt"))

    return {
        "ok": True,
        "saved": True,
    }
def write_report(report):
    data_path = "%s/%s/%s/report.json" % (report['inspector'], report['year'], report['report_id'])
    utils.write(
        utils.json_for(report),
        "%s/%s" % (utils.data_dir(), data_path)
    )
    return data_path
def write_bill_version_metadata(bill_version_id):
    bill_type, number, congress, version_code = utils.split_bill_version_id(bill_version_id)

    bill_version = {
        'bill_version_id': bill_version_id,
        'version_code': version_code,
        'urls': {},
    }

    mods_ns = {"mods": "http://www.loc.gov/mods/v3"}
    doc = etree.parse(document_filename_for(bill_version_id, "mods.xml"))
    locations = doc.xpath("//mods:location/mods:url", namespaces=mods_ns)

    for location in locations:
        label = location.attrib['displayLabel']
        if "HTML" in label:
            format = "html"
        elif "PDF" in label:
            format = "pdf"
        elif "XML" in label:
            format = "xml"
        else:
            format = "unknown"
        bill_version["urls"][format] = location.text

    bill_version["issued_on"] = doc.xpath("string(//mods:dateIssued)", namespaces=mods_ns)

    utils.write(
        json.dumps(bill_version, sort_keys=True, indent=2, default=utils.format_datetime),
        output_for_bill_version(bill_version_id)
    )

    return {'ok': True, 'saved': True}
def write_bill_catoxml(bill_version_id, options):
    utils.write(
        extract_xml_from_json(fetch_single_bill_json(bill_version_id)),
        document_filename_for(bill_version_id, "catoxml.xml")
    )
    return {'ok': True, 'saved': True}
def trade_reciprocity(years, resource):
    corrmeans = []
    for year in years:
        G = get_graph(year, resource)
        corrcoeffs = []
        [xs, ys] = [[], []]
        for country in G.nodes():
            for e in G.edges(country):
                try:
                    [x1, y1] = [G[e[0]][e[1]], G[e[1]][e[0]]]
                    #print [x1,y1]
                    xs.append(x1['weight'])
                    ys.append(y1['weight'])
                except KeyError:
                    'whoops'
            if len(xs) > 1:
                cc = np.corrcoef([xs, ys])
                corrcoeffs.append(cc[0][1])
        #print corrcoeffs
        corrmeans.append(np.mean(corrcoeffs))
        print [year, np.mean(corrcoeffs)]
    write({'means': corrmeans, 'years': years}, get_results_directory(resource), 'meanReciprocityCorrelation')
    plt.clf()
    plt.plot(years, corrmeans)
    plt.title('Mean Correlation of Import/Export By Year')
    plt.xlabel('Year')
    plt.ylabel('Mean Correlation of Import/Export')
    directory = get_images_directory(resource)
    plt.savefig(directory + 'meanReciprocityCorrelation.png')
    plt.clf()
    return 0
def fetch_version(bill_version_id, options):
    logging.info("\n[%s] Fetching..." % bill_version_id)

    bill_type, number, congress, version_code = utils.split_bill_version_id(bill_version_id)
    # bill_id = "%s%s-%s" % (bill_type, number, congress)

    mods_filename = filename_for(bill_version_id)
    mods_cache = version_cache_for(bill_version_id, "mods.xml")
    issued_on, urls = fdsys.document_info_for(mods_filename, mods_cache, options)

    bill_version = {
        'issued_on': issued_on,
        'urls': urls,
        'version_code': version_code,
        'bill_version_id': bill_version_id
    }
    # 'bill_version_id': bill_version_id,
    # 'version_code': version_code

    utils.write(
        json.dumps(bill_version, sort_keys=True, indent=2, default=utils.format_datetime),
        output_for_bill_version(bill_version_id)
    )

    return {'ok': True, 'saved': True}
def output_nomination(nomination, options):
    logging.info("[%s] Writing to disk..." % nomination['nomination_id'])

    # output JSON - so easy!
    utils.write(
        json.dumps(nomination, sort_keys=True, indent=2, default=utils.format_datetime),
        output_for_nomination(nomination['nomination_id'], "json")
    )
def __write_templates(self, project_name, dir_name):
    """ Generate upstart and startup for project """
    upstart = self.__generate_project_template(project_name, 'upstart_template')
    startup = self.__generate_project_template(project_name, 'startup_template')
    write('%s/upstart.conf' % dir_name, upstart)
    write('%s/startup.sh' % dir_name, startup)
def get_sitemap(year, collection, lastmod, options):
    """Gets a single sitemap, downloading it if the sitemap has changed.

    Downloads the root sitemap (year==None, collection==None), or
    the sitemap for a year (collection==None), or the sitemap for a
    particular year and collection. Pass lastmod which is the current
    modification time of the file according to its parent sitemap, which
    is how it knows to return a cached copy.

    Returns the sitemap parsed into a DOM.
    """

    # Construct the URL and the path to where to cache the file on disk.
    if year == None:
        url = "http://www.gpo.gov/smap/fdsys/sitemap.xml"
        path = "fdsys/sitemap/sitemap.xml"
    elif collection == None:
        url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/sitemap_%s.xml" % (year, year)
        path = "fdsys/sitemap/%s/sitemap.xml" % year
    else:
        url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/%s_%s_sitemap.xml" % (year, year, collection)
        path = "fdsys/sitemap/%s/%s.xml" % (year, collection)

    # Should we re-download the file?
    lastmod_cache_file = utils.cache_dir() + "/" + path.replace(".xml", "-lastmod.txt")
    if options.get("cached", False):
        # If --cached is used, don't hit the network.
        force = False
    elif not lastmod:
        # No *current* lastmod date is known for this file (because it is the master
        # sitemap file, probably), so always download.
        force = True
    else:
        # If the file is out of date or --force is used, download the file.
        cache_lastmod = utils.read(lastmod_cache_file)
        force = (lastmod != cache_lastmod) or options.get("force", False)

    if force:
        logging.warn("Downloading: %s" % url)

    body = utils.download(url, path, utils.merge(options, {
        'force': force,
        'binary': True
    }))
    if not body:
        raise Exception("Failed to download %s" % url)

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the file.
    if lastmod and not options.get("cached", False):
        utils.write(lastmod, lastmod_cache_file)

    try:
        return etree.fromstring(body)
    except etree.XMLSyntaxError as e:
        raise Exception("XML syntax error in %s: %s" % (url, str(e)))
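# Hedged usage sketch (not from the original source): one way get_sitemap above might be
# driven. The root document is a sitemap index, so its children are assumed to be
# <sitemap> entries in the sitemaps.org namespace the function already checks for, and
# the option keys mirror the ones read inside get_sitemap.
def example_list_year_sitemaps(options={"cached": False}):
    ns = {"x": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    root = get_sitemap(None, None, None, options)  # master sitemap, always re-downloaded
    for node in root.xpath("x:sitemap", namespaces=ns):
        loc = str(node.xpath("string(x:loc)", namespaces=ns))
        lastmod = str(node.xpath("string(x:lastmod)", namespaces=ns))
        print(loc, lastmod)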
def write_bill_catoxml(bill_version_id, options):
    catoxml_filename = catoxml_filename_for(bill_version_id)
    utils.write(
        extract_xml_from_json(fetch_single_bill_json(bill_version_id)),
        catoxml_filename
    )
    return {"ok": True, "saved": True}
def save_bill_search_state(saved_bills, search_state):
    # For --fast mode, cache the current search result listing (in search_state)
    # to disk so we can detect major changes to the bill through the search
    # listing rather than having to parse the bill.
    for bill_id in saved_bills:
        if bill_id in search_state:
            fast_cache_path = utils.cache_dir() + "/" + bill_info.bill_cache_for(bill_id, "search_result.html")
            new_state = search_state[bill_id]
            utils.write(new_state, fast_cache_path)
def save_meta_result(result):
    path = meta_path_for(result['type'], result['agency'], result['year'], result['id'])

    # for paged metadata, don't overwrite if we've got it already,
    # we don't keep anything that should change.
    if os.path.exists(path):
        logging.debug("[%s][%s] Knew about it, skipping." % (result['id'], result['type']))
    else:
        logging.warn("[%s][%s] Newly discovered, saving metadata." % (result['id'], result['type']))
        utils.write(utils.json_for(result), path)
def Write(target, source, env):
    # We don't use target and source as usual: we may apply this builder several
    # times to the same source/target (or the source may be the target), which is
    # not possible for scons.
    files, contents = WriteArgs(target, source, env)
    for f, c in zip(files, contents):
        utils.write(c, f)
    return None
def save(url, nzb_path):
    file, e = _load_nzb(url)
    if e is None:
        try:
            utils.write(nzb_path, file, 'wb')
        except:
            e = "Pneumatic failed writing %s" % nzb_path
        else:
            e = "Pneumatic saved %s" % nzb_path
    print e
    return
def mirror_file(year, collection, package_name, lastmod, granule_name, file_types, options):
    # Where should we store the file?
    path = get_output_path(year, collection, package_name, granule_name, options)
    if not path:
        return  # should skip

    # Do we need to update this record?
    lastmod_cache_file = path + "/lastmod.txt"
    cache_lastmod = utils.read(lastmod_cache_file)
    force = ((lastmod != cache_lastmod) or options.get("force", False)) and not options.get("cached", False)

    # Try downloading files for each file type.
    targets = get_package_files(package_name, granule_name, path)
    updated_file_types = set()
    for file_type in file_types:
        if file_type not in targets:
            raise Exception("Invalid file type: %s" % file_type)
        f_url, f_path = targets[file_type]

        if (not force) and os.path.exists(f_path):
            continue  # we already have the current file
        logging.warn("Downloading: " + f_path)
        data = utils.download(f_url, f_path, utils.merge(options, {
            'binary': True,
            'force': force,
            'to_cache': False,
            'needs_content': file_type == "text" and f_path.endswith(".html"),
        }))
        updated_file_types.add(file_type)

        if not data:
            if file_type == "pdf":
                # expected to be present for all packages
                raise Exception("Failed to download %s" % package_name)
            else:
                # not all packages have all file types, but assume this is OK
                logging.error("file not found: " + f_url)
                continue

        if file_type == "text" and f_path.endswith(".html"):
            # The "text" format files are put in an HTML container. Unwrap it into a .txt file.
            # TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it?
            # html.fromstring does auto-detection.
            with open(f_path[0:-4] + "txt", "w") as f:
                text_content = unicode(html.fromstring(data).text_content())
                f.write(text_content.encode("utf8"))

    if collection == "BILLS" and "mods" in updated_file_types:
        # When we download bill files, also create the text-versions/data.json file
        # which extracts commonly used components of the MODS XML.
        from bill_versions import write_bill_version_metadata
        write_bill_version_metadata(get_bill_id_for_package(package_name, with_version=True))

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the files for this sitemap item.
    if lastmod and not options.get("cached", False):
        utils.write(lastmod, lastmod_cache_file)
def get_edges():
    data = json.load(open(os.getcwd() + "/data/api/reports/by_petition.json", 'r'))
    keys = sorted(data.keys())
    edges = {}
    for (x, y) in combinations(keys, 2):
        if (x > y):
            x, y = y, x
        edges[x + "_" + y] = set(data[x]).intersection(set(data[y]))
    write(json.dumps(data, indent=2), "api/reports/edges.json")
def run(options):
    for_the_week = get_monday_week(options.get('for_the_week', None))  # yyyymmdd

    logging.info('Scraping upcoming bills from docs.house.gov/floor for the week %s.' % for_the_week)

    # Parse the content into upcoming_bills
    upcoming_bills = fetch_bills_week(for_the_week, options)

    # Write the json to data folder
    output_file = utils.data_dir() + "/upcoming_bills_%s.json" % for_the_week
    utils.write(json.dumps(upcoming_bills, sort_keys=True, indent=2, default=utils.format_datetime), output_file)
def run(options):
    # accepts yyyymmdd format
    for_the_week = get_monday_of_week(options.get('week_of', None))

    logging.warn('Scraping upcoming bills from docs.house.gov/floor for the week of %s.\n' % for_the_week)

    house_floor = fetch_floor_week(for_the_week, options)

    output_file = "%s/upcoming_house_floor/%s.json" % (utils.data_dir(), for_the_week)
    output = json.dumps(house_floor, sort_keys=True, indent=2, default=utils.format_datetime)
    utils.write(output, output_file)

    logging.warn("\nFound %i bills for the week of %s, written to %s" % (len(house_floor['upcoming_bills']), for_the_week, output_file))
def combine(options):
    alld = {}
    files = [x for x in os.listdir(os.getcwd() + "/data/") if re.sub("\d+\.json", "", x) == ""]
    ignore = options.get("ignore", [])
    for file in files:
        sermon = json.load(open(os.getcwd() + "/data/" + file, 'r'))
        for i in ignore:
            if i in sermon:
                sermon.pop(i)
        alld[sermon['uid']] = sermon
    write(json.dumps(alld, indent=2, sort_keys=True), os.getcwd() + "/data/all.json")
    write(json.dumps(alld, sort_keys=True), os.getcwd() + "/data/all.min.json")
def run_for_week(for_the_week, options):
    logging.info('Scraping upcoming bills from docs.house.gov/floor for the week of %s...' % for_the_week)

    house_floor = fetch_floor_week(for_the_week, options)
    if house_floor is None:
        logging.warn("Nothing posted for the week of %s" % for_the_week)
        return

    output_file = "%s/upcoming_house_floor/%s.json" % (utils.data_dir(), for_the_week)
    output = json.dumps(house_floor, sort_keys=True, indent=2, default=utils.format_datetime)
    utils.write(output, output_file)

    logging.warn("Found %i bills for the week of %s, written to %s" % (len(house_floor['upcoming']), for_the_week, output_file))
def main():
    data = read('data/data.json')
    tweets = read('data/tweets.json')

    tf = {}
    idf = {}
    inverted_index = {}

    thread_count = sum(len(rumour) for rumour in data.values())
    for rumour_name, rumour in data.items():
        for thread_id, thread in rumour.items():
            tweets = [thread['source']] + list(thread.get('replies', dict()).values())
            word_index = {}
            word_count = 0
            for tweet in tweets:
                text = tweet['text']
                words = [word.lower() for word in re.findall("[\w#@']+", text)]
                for word in words:
                    word_index[word] = word_index.get(word, 0) + 1
                word_count += len(words)
            for word, count in word_index.items():
                tf[word] = tf.get(word, dict())
                tf[word][thread_id] = count / float(word_count)
                inverted_index[word] = inverted_index.get(word, dict())
                inverted_index[word][thread_id] = 1

    for word, thread_dict in inverted_index.items():
        idf[word] = math.log(thread_count / float(len(thread_dict)))

    write('data/tfidf.json', {
        "tf": tf,
        "idf": idf,
        "inverted_index": inverted_index
    })
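# Hedged follow-up sketch (not from the original source): the function above stores term
# frequencies and inverse document frequencies separately, so a consumer would combine
# them per (word, thread) as tf * idf. It reuses the module's read() helper under the
# assumption that it mirrors the write() call above and returns the parsed dict.
def example_tfidf_weight(word, thread_id):
    data = read('data/tfidf.json')
    tf = data['tf'].get(word, {}).get(thread_id, 0.0)
    idf = data['idf'].get(word, 0.0)
    return tf * idf  # standard tf-idf weight of this word in this thread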
def train(self, batches):
    loss_total = 0.
    crr = 0.
    true_total = 0.
    pred_total = 0.

    start = time.time()
    batch_indices = range(len(batches))
    np.random.shuffle(batch_indices)
    self.model.feat_layer.is_train.set_value(1)

    for index, b_index in enumerate(batch_indices):
        if (index + 1) % 100 == 0:
            print '%d' % (index + 1),
            sys.stdout.flush()

        batch = batches[b_index]
        loss_i, crr_i, true_i, pred_i = self.train_func(*batch)

        loss_total += loss_i
        crr += crr_i
        true_total += true_i
        pred_total += pred_i

    avg_loss = loss_total / float(len(batches))
    precision = crr / pred_total if pred_total > 0 else 0.
    recall = crr / true_total if true_total > 0 else 0.
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.

    write('\n\tTime: %f seconds' % (time.time() - start))
    write('\tAverage Negative Log Likelihood: %f' % avg_loss)
    write('\tLabel: F1:%f\tP:%f(%d/%d)\tR:%f(%d/%d)' % (f1, precision, crr, pred_total, recall, crr, true_total))
def create_preload_list():
    preload_json = None

    if PRELOAD_CACHE and os.path.exists(PRELOAD_CACHE):
        logging.debug("Using cached Chrome preload list.")
        preload_json = json.loads(open(PRELOAD_CACHE).read())
    else:
        logging.debug("Fetching Chrome preload list from source...")

        # Downloads the chromium preloaded domain list and sets it to a global set
        file_url = 'https://chromium.googlesource.com/chromium/src/net/+/master/http/transport_security_state_static.json?format=TEXT'

        # TODO: proper try/except around this network request
        request = requests.get(file_url)
        raw = request.content

        # To avoid parsing the contents of the file out of the source tree viewer's
        # HTML, we download it as a raw file. googlesource.com Base64-encodes the
        # file to avoid potential content injection issues, so we need to decode it
        # before using it. https://code.google.com/p/gitiles/issues/detail?id=7
        raw = base64.b64decode(raw).decode('utf-8')

        # The .json file contains '//' comments, which are not actually valid JSON,
        # and confuse Python's JSON decoder. Begone, foul comments!
        raw = ''.join([re.sub(r'^\s*//.*$', '', line) for line in raw.splitlines()])
        preload_json = json.loads(raw)

        if PRELOAD_CACHE:
            logging.debug("Caching preload list at %s" % PRELOAD_CACHE)
            utils.write(utils.json_for(preload_json), PRELOAD_CACHE)

    # For our purposes, we only care about entries that includeSubDomains
    fully_preloaded = []
    for entry in preload_json['entries']:
        if entry.get('include_subdomains', False) is True:
            fully_preloaded.append(entry['name'])

    return fully_preloaded
def divide_train_dev(tweets):
    train_categories = read(TRAIN)
    dev_categories = read(DEV)

    train = []
    dev = []
    for tweet in tweets:
        if tweet.get('reply_to'):
            el = {
                'text': tweet['text'],
                'reply_to': tweet['reply_to']
            }
            if tweet['id'] in train_categories:
                el['group'] = train_categories[tweet['id']]
                train += [el]
                # train += [{
                #     'text': tweet['text'],
                #     'reply_to': tweet['reply_to'],
                #     'group': train_categories[tweet['id']]
                # }]
            else:
                el['group'] = dev_categories[tweet['id']]
                dev += [el]
                # dev += [{
                #     'text': tweet['text'],
                #     'reply_to': tweet['reply_to'],
                #     'group': dev_categories[tweet['id']]
                # }]
            # all += [el]

    write('data/train.json', train)
    write('data/dev.json', dev)
    write('data/groups.json', dict(train_categories.items() | dev_categories.items()))
def predict(self, batches):
    """
    :param batches: 1D: n_batches, 2D: n_words; elem=(x_w, x_m)
    :return: y: 1D: n_batches, 2D: batch_size; elem=(y_pred(1D:n_words), y_proba(float))
    """
    start = time.time()
    y = []
    self.model.feat_layer.is_train.set_value(0)

    for index, inputs in enumerate(batches):
        if (index + 1) % 1000 == 0:
            print '%d' % (index + 1),
            sys.stdout.flush()

        if len(inputs) == 0:
            y_pred = []
        else:
            y_pred = self.pred_func(*inputs)

        y.append(y_pred)

    write('\n\tTime: %f seconds' % (time.time() - start))
    return y
def main():
    args = docopt.docopt(__doc__, version='v0.0.1')
    utils.configure_logging(args['--debug'])

    out_file = args['--output']

    # Read from a .csv, or allow domains on the command line.
    domains = []
    if args['INPUT'][0].endswith(".csv"):
        domains = utils.load_domains(args['INPUT'][0])
    else:
        domains = args['INPUT']

    # If the user wants to sort them, sort them in place.
    if args['--sorted']:
        domains.sort()

    options = {
        'user_agent': args['--user-agent'],
        'timeout': args['--timeout'],
        'preload_cache': args['--preload-cache'],
        'cache': args['--cache']
    }
    results = pshtt.inspect_domains(domains, options)

    # JSON can go to STDOUT, or to a file.
    if args['--json']:
        output = utils.json_for(results)
        if out_file is None:
            print(output)
        else:
            utils.write(output, out_file)
            logging.warn("Wrote results to %s." % out_file)
    # CSV always goes to a file.
    else:
        if args['--output'] is None:
            out_file = 'results.csv'
        pshtt.csv_for(results, out_file)
        logging.warn("Wrote results to %s." % out_file)
def report_rates_on_epoch(self, label: str, epno: int, batch_results: BatchResult,
                          report_params: ReportParameters) -> None:
    report_str = 'Total #preds: {}\n'.format(batch_results.total_weight)

    true_pred = batch_results.weighted_true_preds
    false_miss = batch_results.weighted_n_labs - batch_results.weighted_true_preds
    false_pred = batch_results.weighted_n_preds - batch_results.weighted_true_preds
    # true_miss = (batch_results.total_weight - batch_results.weighted_n_labs) - false_pred

    report_for_i = lambda i: list(
        map(lambda x: x[i], (report_params.label_name_map, true_pred, false_miss, false_pred)))

    report_str += tabulate(
        list(map(report_for_i, range(report_params.top_k))),
        headers=['Label', '#Correct', '#Missed', '#Falsely Predicted'])
    report_str += '\n'

    utils.write(
        report_str,
        os.path.join('reports', utils.get_time_str(), 'epoch_{}_{}'.format(epno, label)))
def _prepare(language):
    wikipron = load_wikipron(language)

    # Preprocess
    preprocessor = Preprocessor(language)
    wikipron["preprocessed"] = wikipron["phonemes"].apply(preprocessor.preprocess)

    # Filter
    wikipron = wikipron[
        wikipron["preprocessed"].str.split(" ").apply(len) > 3
    ]  # at least two phonemes

    # Split
    train, valid = train_test_split(
        wikipron, train_size=0.9, random_state=SEED, shuffle=True
    )

    # Order train
    train = train.sample(frac=1)
    train["length"] = train["preprocessed"].str.len()
    train.sort_values(by=["frequency", "length"], ascending=(False, True), inplace=True)

    # Write
    filename = f"data/phoneme/wikipron/{language}/train.txt"
    write(train["preprocessed"], filename)
    filename = f"data/phoneme/wikipron/{language}/validation.txt"
    write(valid["preprocessed"], filename)
def main():
    configs = utils.loadConfigs()

    # create and initialize analysis object
    analyzer = analysis.Analysis("USDT_BTC", 300, 500, configs)
    analyzer.setup()

    # create and initialize transactor (initially holding btc)
    trans_que = queue.Queue()
    trans = transactor.TransactorThread("USDT_BTC", trans_que, True, configs)
    trans.start()

    utils.sendMsg("trader started")
    utils.write("trader started")

    while True:
        if (analyzer.update()):
            val = analyzer.analyze()
            trans_que.put(val)
        time.sleep(60)

    trans.join()
def index(options={}):
    concordance = defaultdict(list)
    files = [x for x in os.listdir(os.getcwd() + "/data/") if re.sub("\d+\.json", "", x) == ""]
    if options.get('limit', False):
        files = files[:options.get('limit')]
    for file in files:
        sermon = json.load(open(os.getcwd() + "/data/" + file, 'r'))
        words = uniq(re.findall(r"\b[A-z]+\b", sermon['opening'].replace('\n', ' ').lower()))
        '''
        if options.get("uniques", False):
            words = uniq(re.findall(r"\b[A-z]+\b", sermon['opening'].replace('\n', ' ').lower()))
        else:
            words = re.findall(r"\b[A-z]+\b", sermon['opening'].replace('\n', ' ').lower())
        '''
        for word in words:
            if len(word) > 2:
                concordance[word].append(file.replace('.json', ''))
    write(json.dumps(concordance, sort_keys=True, indent=2), os.getcwd() + "/src/data/index.json")
    write(json.dumps(concordance, sort_keys=True), os.getcwd() + "/src/data/index.min.json")
def get_petitions(mx=-1, offset=0):
    limit = 100
    stop = False
    petitions = []
    while not stop:
        data = fetch_petitions(offset, limit)
        if "results" not in data or len(data["results"]) == 0:
            stop = True
            continue
        petitions += data["results"]
        if mx > -1 and len(petitions) > mx:
            petitions = petitions[:mx]
            stop = True
        offset += limit
    for petition in petitions:
        write(json.dumps(petition, indent=2), "api/petitions/" + petition['id'] + ".json")
    return petitions
def mirror_bulkdata_file(sitemap, url, item_path, lastmod, options):
    # Where should we store the file?
    path = "%s/fdsys/%s/%s" % (utils.data_dir(), sitemap["collection"], item_path)

    # For BILLSTATUS, store this along with where we store the rest of bill
    # status data.
    if sitemap["collection"] == "BILLSTATUS":
        from bills import output_for_bill
        bill_id, version_code = get_bill_id_for_package(os.path.splitext(os.path.basename(item_path))[0], with_version=False)
        path = output_for_bill(bill_id, FDSYS_BILLSTATUS_FILENAME, is_data_dot=False)

    # Where should we store the lastmod found in the sitemap so that
    # we can tell later if the file has changed?
    lastmod_cache_file = os.path.splitext(path)[0] + "-lastmod.txt"

    # Do we already have this file up to date?
    if os.path.exists(lastmod_cache_file) and not options.get("force", False):
        if lastmod == utils.read(lastmod_cache_file):
            return

    # With --cached, skip if the file is already downloaded.
    if os.path.exists(path) and options.get("cached", False):
        return

    # Download.
    logging.warn("Downloading: " + path)
    data = utils.download(url, path, utils.merge(options, {
        'binary': True,
        'force': True,  # decision to cache was made above
        'to_cache': False,
    }))
    if not data:
        # Something failed.
        return

    # Write the current last modified date back to disk so we know the next time whether
    # we need to fetch the file again.
    utils.write(lastmod, lastmod_cache_file)
def set_train_func(self):
    write('\nBuilding an lp train func...')

    y_label = T.imatrix('y')

    label_proba = self.model.calc_label_proba(self.model.inputs)
    label_pred = self.model.argmax_label_proba(label_proba)
    true_label_path_score = self.model.calc_label_path_score(label_proba, y_label)

    cost = - T.mean(true_label_path_score) + L2Regularizer()(alpha=self.argv.reg, params=self.model.params)
    grads = T.grad(cost=cost, wrt=self.model.params)
    self.optimizer = get_optimizer(argv=self.argv)
    updates = self.optimizer(grads=grads, params=self.model.params)

    self.train_func = theano.function(
        inputs=self.model.inputs + [y_label],
        outputs=[cost,
                 categorical_accuracy(y_true=y_label, y_pred=label_pred),
                 label_pred.flatten(),
                 y_label.flatten()
                 ],
        updates=updates,
        mode='FAST_RUN'
    )
def main():
    data = read(REPLIES)
    negative = [line.rstrip('\n') for line in open(NEGATIVE_LEXICON)]
    positive = [line.rstrip('\n') for line in open(POSITIVE_LEXICON)]

    result = {}
    for tweet in data:
        if "reply_to" in tweet:
            tweet_id = tweet['id']
            text = tweet['text']
            vector = list()

            in_reply_to = tweet['reply_to']
            vector.append(contains_original(text, in_reply_to))
            vector.append(opinion_words_count(text, positive))
            vector.append(opinion_words_count(text, negative))
            if "tags" in tweet:
                tags = tweet['tags']
                vector += reversed_word_order(tags)
            vector.append(contains_question_mark(text))

            result[tweet_id] = {
                'rumour': tweet['rumour'],
                'vector': vector
            }

    write(OUTFILE, result)
def createSQLite3DB(filename, schemapath, override):
    cmd = "sqlite3 {} < {}".format(filename, os.path.join(schemapath, "sqlite3.sql"))
    write("+ Create SQLite3 database at '{}'".format(filename))

    if os.path.exists(filename) and not override:
        print("+ Database file '{}' exists".format(filename))
        return False

    path = os.path.dirname(filename)
    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except OSError as err:
            print("+ Could not create directory '{}': {} ".format(path, err))
            return False

    res = execute(cmd)
    if res.error:
        print(" + {}".format(res.error))
        return False

    return True
def gen_operators(out, autograd_functions):
    all_operators_declarations = []
    all_operators_defines = []

    for func in autograd_functions:
        if ("namespace" in func["declaration"]["method_of"]
                and func["declaration"]["inplace"] == False):
            declr = gen_operator_declaration(func)
            defn = gen_operator_define(func)
            all_operators_declarations.append(OPERATOR_DECLARATION.substitute(declr))
            all_operators_defines.append(OPERATOR_DEFINE.substitute(defn))
        else:
            ## TODO
            pass

    top_env = {
        "auto_operator_declarations": all_operators_declarations,
        "auto_operator_defines": all_operators_defines,
    }

    write(out, "express_operator.h", OPERATORS_H, top_env)
    write(out, "express_operator.cpp", OPERATORS_CPP, top_env)
def run(options):
    # accepts yyyymmdd format
    given_week = options.get('week_of', None)
    if given_week is None:
        for_the_week = get_latest_monday(options)
    else:
        for_the_week = get_monday_of_week(given_week)

    logging.warn('Scraping upcoming bills from docs.house.gov/floor for the week of %s.\n' % for_the_week)

    house_floor = fetch_floor_week(for_the_week, options)

    output_file = "%s/upcoming_house_floor/%s.json" % (utils.data_dir(), for_the_week)
    output = json.dumps(house_floor, sort_keys=True, indent=2, default=utils.format_datetime)
    utils.write(output, output_file)

    logging.warn("\nFound %i bills for the week of %s, written to %s" % (len(house_floor['upcoming']), for_the_week, output_file))
def extract_bill_version_metadata(package_name, text_path):
    bill_version_id = get_bill_id_for_package(package_name)
    bill_type, number, congress, version_code = utils.split_bill_version_id(bill_version_id)

    bill_version = {
        'bill_version_id': bill_version_id,
        'version_code': version_code,
        'urls': {},
    }

    mods_ns = {"mods": "http://www.loc.gov/mods/v3"}
    doc = etree.parse(os.path.join(text_path, "mods.xml"))
    locations = doc.xpath("//mods:location/mods:url", namespaces=mods_ns)

    for location in locations:
        label = location.attrib['displayLabel']
        if "HTML" in label:
            format = "html"
        elif "PDF" in label:
            format = "pdf"
        elif "XML" in label:
            format = "xml"
        else:
            format = "unknown"
        bill_version["urls"][format] = location.text

    bill_version["issued_on"] = doc.xpath("string(//mods:dateIssued)", namespaces=mods_ns)

    utils.write(
        json.dumps(bill_version, sort_keys=True, indent=2, default=utils.format_datetime),
        output_for_bill_version(bill_version_id))
def gen_rpcs(out, autograd_functions):
    all_rpc_defines = []
    all_rpc_binds = []
    all_names = {}

    for func in autograd_functions:
        if ("namespace" in func["declaration"]["method_of"]
                and func["declaration"]["inplace"] == False):
            name = func["declaration"]["api_name"]
            if (name not in all_names):
                all_names[name] = 0
            else:
                all_names[name] = all_names[name] + 1
                name = name + str(all_names[name])

            bind = gen_rpc_bind(func)
            bind["api_name"] = name
            all_rpc_binds.append(RPC_BIND.substitute(bind))

            define = gen_rpc_define(func)
            define["api_name"] = name
            all_rpc_defines.append(RPC_DEFINE.substitute(define))
        elif ("Tensor" in func["declaration"]["method_of"]
              and func["declaration"]["inplace"] == True):
            ## TODO
            pass
        else:
            pass

    top_env = {
        "auto_rpc_binds": all_rpc_binds,
        "auto_rpc_defines": all_rpc_defines
    }

    write(out, "express_rpc.h", RPC_H, top_env)
    write(out, "express_rpc.cpp", RPC_CPP, top_env)
def _count_batches(self, train_samples, dev_samples):
    write('\n\tMaking Batches...')
    train_batches = self.preprocessor.make_batches(samples=train_samples)
    if dev_samples:
        dev_batches = self.preprocessor.make_batches(samples=dev_samples)
    else:
        dev_batches = []
    write('\t- Train Batches: %d' % len(train_batches))
    write('\t- Dev Batches: %d' % len(dev_batches))
def run(self):
    log = {}

    una, miss, cta = clt.find_missing(self.cfg.move_csv, self.cfg.video_csv, self.cfg.output_dir)
    log['missing'] = {
        'unavailable': len(una),
        'missing': len(miss),
        'call_to_action': len(cta)
    }
    miss.to_csv(os.path.join(self.cfg.output_dir, 'missing.csv'), sep='\t')

    una, found = clt.collect(miss, self.cfg.video_dst, self.cfg.output_dir)
    log['collect'] = {'unavailable': len(una), 'found': len(found)}

    update_path = os.path.join(self.cfg.output_dir, 'updated.csv')
    updated, err = clt.update_videos(self.cfg.move_csv, self.cfg.video_csv, found, self.cfg.video_src, update_path)

    write('no_videos_clt.txt', err)
    write('collect_videos.json', log)
def gen_parts(p_size, msg, n, k):
    if n < k:
        print('ERROR: number of shares={} is less than k={}'.format(n, k))
        return

    with open(msg, 'rb') as f:
        msg = bytearray(f.read())

    part_size = ceil(len(msg) / k)
    q = [
        int.from_bytes(msg[i * part_size:(i + 1) * part_size], byteorder='big')
        for i in range(k)
    ]

    p_size_max = max(qi.bit_length() for qi in q) + 1
    if p_size_max > p_size:
        p_size = p_size_max + 1
        print('LOG: length of modulus p changed to {}'.format(p_size))

    p = gen_prime(p_size)
    mat = gen_mat(p, n, k, q)

    write('p.txt', p)
    for idx, part in enumerate(mat):
        write('part_{}.txt'.format(idx + 1), part)
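# Hedged usage sketch (not from the original source): gen_parts above splits the secret
# file into k big-endian integer chunks, picks a prime modulus p of at least p_size bits,
# and writes p.txt plus one part_<idx>.txt per share row returned by gen_mat. gen_prime,
# gen_mat and write are assumed module helpers; 'secret.bin' is a hypothetical input file.
def example_gen_parts():
    # produce n=5 shares with reconstruction threshold k=3, using a >=512-bit prime
    gen_parts(p_size=512, msg='secret.bin', n=5, k=3)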
def main(args):
    print(args)
    index_file = "stem_{}_stop_{}_inverted_index.txt".format(args.isstemmed, args.isstopped)

    queries = utils.load_queries(utils.PARSED_QUERIES)
    if args.isstemmed:
        queries = utils.load_queries(utils.STEM_QUERIES)

    index = utils.load_inverted_index(os.path.join(utils.INDEX_DIR, index_file))
    stats = utils.load_corpus_stats()

    obj = BM25(args, index, stats, queries[49:54])
    obj.compute_scores()

    file_name = "stem_{}_stop_{}_bm25_score.csv".format(args.isstemmed, args.isstopped)
    file_path = os.path.join(utils.RESULT_DIR, "bm25", file_name)
    utils.write(obj.log, file_path, obj.bm25_scores, csvf=True)

    file_name2 = "stem_{}_stop_{}_bm25_score.json".format(args.isstemmed, args.isstopped)
    file_path2 = os.path.join(utils.RESULT_DIR, "bm25", file_name2)
    utils.write(obj.log, file_path2, obj.bm25_scores)
def make_show_list(main_data):
    """
    Renders the show list page
    :param main_data: the main TemplateData instance
    :return: empty
    """
    show_template_data = deepcopy(main_data)
    show_summaries = []

    for year in get_defined_years():
        for season in reversed(seasons):
            yaml_path = path.join(utils.root, 'site', year, season, 'show.yaml')
            if not path.isfile(yaml_path):
                continue

            show_data = load_or_die(yaml_path)
            graphic = get_show_graphic(year, season)
            is_current = year == current_year and season == current_season
            show_data.update({
                'year': year,
                'season': season,
                'graphic': graphic,
                'is_current': is_current
            })

            show_template_data.bind('show', show_data)
            show_summaries.append(compiled_summary_template.evaluate(show_template_data))

    show_list_data = deepcopy(main_data)
    show_list_data.bind('show_list', show_summaries)
    write(show_list_data, 'MTG - Show List',
          compiled_show_list_template.evaluate(show_list_data),
          'site', 'show_list.html')
def run(options):
    # Load the committee metadata from the congress-legislators repository and make a
    # mapping from thomas_id and house_id to the committee dict. For each committee,
    # replace the subcommittees list with a dict from thomas_id to the subcommittee.
    utils.require_congress_legislators_repo()
    committees = {}
    for c in utils.yaml_load("cache/congress-legislators/committees-current.yaml"):
        committees[c["thomas_id"]] = c
        if "house_committee_id" in c:
            committees[c["house_committee_id"] + "00"] = c
        c["subcommittees"] = dict((s["thomas_id"], s) for s in c.get("subcommittees", []))

    for chamber in ("house", "senate"):
        # Load any existing meetings file so we can recycle GUIDs generated for Senate meetings.
        existing_meetings = []
        output_file = utils.data_dir() + "/committee_meetings_%s.json" % chamber
        if os.path.exists(output_file):
            existing_meetings = json.load(open(output_file))

        # Scrape for meeting info.
        if chamber == "senate":
            meetings = fetch_senate_committee_meetings(existing_meetings, committees, options)
        else:
            meetings = fetch_house_committee_meetings(existing_meetings, committees, options)

        # Write out.
        utils.write(
            json.dumps(meetings, sort_keys=True, indent=2, default=utils.format_datetime),
            output_file)
def lucene_result_parser():
    result_set = defaultdict(list)
    result_path = os.path.join(utils.RESULT_DIR, "lucene_regular")
    files = os.listdir(result_path)

    for file in files:
        qid, ext = os.path.basename(file).split(".")
        file_path = os.path.join(result_path, file)
        if ext == "txt":
            with open(file_path, "r") as fp:
                content = fp.read().split("\n")
                for row in content:
                    if len(row) > 0:
                        doc_path, score = row.split()
                        _, score = score.split("=")
                        docid = os.path.basename(doc_path).split(".")[0]
                        result_set[qid].append((docid, float(score)))

    result_path = os.path.join(utils.RESULT_DIR, "lucene", "stem_False_stop_False_lucene_score.csv")
    result_path2 = os.path.join(utils.RESULT_DIR, "lucene", "stem_False_stop_False_lucene_score.json")
    utils.write(None, result_path, result_set, csvf=True)
    utils.write(None, result_path2, result_set)
def _run(sim_name):
    local_only = [None] * len(n_sims)
    for n, i in enumerate(n_sims):
        print(i)
        d = info.copy()
        d['n_sim'] = i
        d['simulator'] = sim_name

        # get name for CPU/GPU
        # for backwards compatibility, I'm adding the CPU name into the GPU column
        if sim_name == 'cl_amd_gpu':
            gpu_name = 'Radeon Vii'
        elif sim_name in ('cl_nvidia', 'cuda'):
            try:
                from pycuda.driver import Device
                gpu_name = ''.join([
                    i for i in Device(int(gpu_id)).name().split(' ')
                    if i != 'GeForce'
                ])
            except:
                gpu_name = 'RTX2080'
        else:
            gpu_name = get_info()['cpu_name']
        d['gpu_name'] = gpu_name

        d['total_time'], d['sim_time'] = run(i, model, tspan, sim_name)
        write(d)
        local_only[n] = d

    tmp_pd = pd.DataFrame(local_only)
    print(tmp_pd[['n_sim', 'sim_time']])
    out_name = os.path.join(
        cur_dir, 'Timings',
        '{}_{}_{}_timing.csv'.format(computer_name, sim_name, model.name))
def moving_avgs(deq):
    """
    Takes in a deque of 500 data points
    Returns either Buy, Sell, or Hold
    """
    dat = pd.DataFrame(list(deq))
    d = dat['weightedAverage'].astype(float)
    deque_length = len(d)

    lma = np.mean(d)
    sma_start = int(4 * deque_length / 5)
    sma = np.mean(d[sma_start:])

    date = datetime.datetime.fromtimestamp(int(dat.iloc[len(dat) - 1]['date'])).strftime('%Y-%m-%d %H:%M:%S')
    utils.write(date)
    utils.write(str(lma) + "," + str(sma))

    if (sma - lma) / lma > 0.01:
        return 1
    elif (sma - lma) / lma < -0.01:
        return -1
    else:
        return 0
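# Hedged usage sketch (not from the original source): moving_avgs above expects a deque of
# candle-style dicts carrying at least 'weightedAverage' and 'date' (a UNIX timestamp), and
# returns 1 / -1 / 0 for buy / sell / hold depending on whether the short moving average
# (the last fifth of the window) diverges from the long one by more than 1%. The synthetic
# candles below are illustrative only.
def example_moving_avgs():
    from collections import deque
    candles = deque(maxlen=500)
    for t in range(500):
        candles.append({'date': 1500000000 + 300 * t,          # 5-minute candles
                        'weightedAverage': 100.0 + 0.01 * t})  # slow uptrend
    return moving_avgs(candles)  # 1 = buy, -1 = sell, 0 = hold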
async def cog_load(ctx: Context, *, cog_names: str):
    loaded_cog_names = {name.lower() for name in bot.cogs.keys()}
    cog_names = cog_names.lower().split()

    try:
        to_load = set()
        for cog_name in cog_names:
            if cog_name not in loaded_cog_names:
                to_load.add(NAMES_COGS_MAP[cog_name])
            else:
                raise ValueError

    except KeyError:
        error = InvalidArguments(
            ctx=ctx,
            message=f"{cog_name} not found"
        )
        await error.execute()

    except ValueError:
        error = InvalidArguments(
            ctx=ctx,
            message=f"{cog_name} already loaded"
        )
        await error.execute()

    else:
        load_cogs(bot, to_load)
        write(COG_PATH, [name.lower() for name in bot.cogs.keys()])
        await ctx.send(f'Successfully loaded {", ".join(cog_names)}')
def menu_main(self):
    """Build the main menu."""
    self.game = None

    # Print game name
    title, title_pos = write('Burglar', 124, 'Multicolore.otf', colorScheme.MAINMENUTITLE)
    title_pos.centerx = self.background.get_rect().centerx
    title_pos.centery = self.background.get_rect().height / 3
    self.background.blit(title, title_pos)

    self.mm.assemble()
    self.mm.menu_pos.centery = 2 * (self.background.get_rect().height / 3)
    self.mm.menu_pos.centerx = self.background.get_rect().centerx
    self.background.blit(self.mm.menu, self.mm.menu_pos)