def generate_lsh_graph(data_set, num_hashes=3, num_bits=5, verbose=False):
    hashers = MultiLSHasher(num_hashes, num_bits)
    if verbose: print 'Hashers initialized'
    num_docs, num_features = get_counts(data_set)
    # First pass: load raw (word, count) features and document frequencies.
    doc_features = {}
    word_counts = Counter()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0])
            word = int(row[1])
            count = float(row[2])
            word_counts[word] += 1
            if doc not in doc_features:
                doc_features[doc] = []
            doc_features[doc].append((word, count))
    if verbose: print 'Loaded doc features'
    # Re-weight counts as tf-idf before hashing. Note: the guard below appears
    # intended to skip data that is already weighted, but as written it can
    # never fire, since every feature is a (word, count) tuple.
    for doc, features in doc_features.items():
        if type(features[0]) is float:
            break
        feature_tfidf = []
        for w, c in features:
            tfidf = math.log(c + 1) * math.log(num_docs / float(word_counts[w]))
            feature_tfidf.append((w, tfidf))
        doc_features[doc] = feature_tfidf
    hashers.compute_stream(doc_features)
    signatures = hashers.compute_signatures()
    if verbose: print 'Computed signatures'
    # Second pass: replace each word with one hashed feature per hasher,
    # formed by appending the hash label and the doc's signature.
    doc_features = {}
    words_doc_count = Counter()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0])
            count = float(row[2]) if '.' in row[2] else int(row[2])
            for hl, s in signatures.items():
                word = str(row[1]) + hl + s[doc]
                words_doc_count[word] += 1
                if doc not in doc_features:
                    doc_features[doc] = []
                doc_features[doc].append((word, count))
    if verbose: print 'Generated hashed doc features'
    filename = '%s-lsh-h%db%d' % (data_set, num_hashes, num_bits)
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for doc, feature_counts in doc_features.items():
            for feature, count in feature_counts:
                tfidf = math.log(count + 1) * math.log(
                    num_docs / float(words_doc_count[feature]))
                datawriter.writerow([doc, feature, tfidf])
    if verbose: print 'Wrote graph file %s' % filename
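# The MultiLSHasher class used above is defined elsewhere in the project; the
# following is a minimal, hypothetical sketch of the interface these functions
# assume, implemented with signed random projections. It is illustrative only:
# compute_stream() takes {doc: [(word, weight), ...]} and compute_signatures()
# returns {hash_label: {doc: bit_string}}, with hashes labelled 'a', 'b', etc.
import random

class MultiLSHasher(object):
    def __init__(self, num_hashes, num_bits, seed=0):
        self.labels = [chr(ord('a') + i) for i in range(num_hashes)]
        self.num_bits = num_bits
        self.rng = random.Random(seed)
        self.planes = {}   # (label, bit, word) -> +/-1 coefficient, drawn lazily
        self.docs = {}

    def _coeff(self, label, bit, word):
        key = (label, bit, word)
        if key not in self.planes:
            self.planes[key] = self.rng.choice((-1.0, 1.0))
        return self.planes[key]

    def compute_stream(self, doc_features):
        self.docs = doc_features

    def compute_signatures(self):
        signatures = {}
        for label in self.labels:
            sigs = {}
            for doc, features in self.docs.items():
                bits = []
                for bit in range(self.num_bits):
                    # sign of the dot product with a random hyperplane
                    dot = sum(weight * self._coeff(label, bit, word)
                              for word, weight in features)
                    bits.append('1' if dot >= 0 else '0')
                sigs[doc] = ''.join(bits)
            signatures[label] = sigs
        return signatures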
def setUp(self):
    self.file = open_data_file('manifest.rdf')
    self.manifest = Manifest(self.file)
    self.suite = RDFTestSuite.from_manifest(self.manifest, opener=NULL_OPENER)
    self.result = unittest.TestResult()
    self.suite.run(self.result)
def gen_lsh(num_hashes, num_bits, verbose=True):
    # first hash labelled 'a', second labelled 'b', etc.
    # (data_set is a module-level global here)
    hashers = MultiLSHasher(num_hashes, num_bits)
    print '%d hashes, %d bits' % (num_hashes, num_bits)
    if verbose: print 'Hashers initialized'
    doc_features = {}
    words_doc_count = [0 for i in xrange(util.get_num_features(data_set) + 1)]
    with util.open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0])
            word = int(row[1])
            count = int(row[2])
            words_doc_count[word] += 1
            if doc not in doc_features:
                doc_features[doc] = []
            doc_features[doc].append((word, count))
    if verbose: print 'Loaded doc features'
    hashers.compute_stream(doc_features)
    signatures = hashers.compute_signatures()
    if verbose: print 'Computed signatures'
    # histogram of bucket sizes: hash label + signature -> number of docs
    hd = {}
    for hl, s in signatures.items():
        for doc, sig in s.items():
            h = hl + sig
            if h not in hd:
                hd[h] = 0
            hd[h] += 1
    return hd
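# Illustrative only: the dict returned above maps hash label + signature to
# bucket size. For num_hashes=2, num_bits=3 it might look like the literal
# below, and a quick way to eyeball bucket balance is:
hd = {'a010': 41, 'a110': 12, 'b001': 30, 'b101': 23}
total = float(sum(hd.values()))
for h in sorted(hd):
    print '%s %4d (%4.1f%%)' % (h, hd[h], 100 * hd[h] / total)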
def generate_graph(data_set):
    with util.open_data_file(data_set) as data:
        with open(path.join(PROPPR_PROGRAM_DIR, data_set + '.graph'), 'w') as f:
            for line in data:
                doc, feature, weight = line.split()
                f.write('\t'.join(['hasWord', 'd' + doc, 'w' + feature]) + '\n')
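# Assumed data format, inferred from the loaders in this project: each input
# line is a space-separated 'doc feature weight' triple, and each becomes one
# unweighted ProPPR fact (the weight column is not carried over), e.g.
#   input:  '12 345 0.7'
#   output: 'hasWord\td12\tw345'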
def setUp(self):
    if getattr(self, 'tag', None) is not None:
        xml = open_data_file('manifest.rdf').read()
        self.manifest = ElementTree.XML(xml)
        self.element = self.manifest.find(str(self.tag))
        self.test = Test.from_element(self.element)
    else:
        self.skipTest("'tag' attribute not set. abstract test case?")
def testcases():
    manifest = Manifest(open_data_file('rdfcore/Manifest.rdf'))
    rdf_suite = RDFTestSuite.from_manifest(manifest, opener=TEST_OPENER)
    if not TEST_URIS:
        return rdf_suite
    else:
        suite = RDFTestSuite()
        for test in rdf_suite:
            if test.id() in TEST_URIS:
                suite.addTest(test)
        return suite
def get_doc_features(data_set):
    doc_features = {}
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0])
            word = int(row[1])
            count = float(row[2]) if '.' in row[2] else int(row[2])
            if doc not in doc_features:
                doc_features[doc] = []
            doc_features[doc].append((word, count))
    return doc_features
def load_data(data_set):
    num_docs = util.get_num_docs(data_set)
    num_feats = util.get_num_features(data_set)
    data = np.zeros((num_docs, num_feats))
    with util.open_data_file(data_set) as data_file:
        data_reader = csv.reader(data_file, delimiter=' ')
        for row in data_reader:
            doc = int(row[0]) - 1
            word = int(row[1]) - 1
            count = int(row[2])
            data[doc][word] = count
    return data
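# A self-contained sketch of the format load_data() expects: space-separated
# 'doc word count' triples with 1-based IDs (the util helpers above are
# project-specific, so this uses an in-memory list instead). Runs standalone:
import csv
import numpy as np

rows = ['1 1 3', '1 2 1', '2 2 4']   # two docs, two features
X = np.zeros((2, 2))
for d, w, c in csv.reader(rows, delimiter=' '):
    X[int(d) - 1][int(w) - 1] = int(c)
# X is now [[3., 1.], [0., 4.]]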
def make_small_data_set(data_set, num_docs, labels):
    small_set = 's' + ''.join(map(str, labels)) + '_' + data_set
    doc_features = util.get_doc_features(data_set)
    label_docs = util.get_label_docs(data_set)
    samp_size = num_docs / len(labels)   # integer division: docs per label
    with util.open_data_file(small_set, 'wb') as data:
        datawriter = csv.writer(data, delimiter=' ')
        for label in labels:
            docs = random.sample(label_docs[label], samp_size)
            for doc in docs:
                for feature, count in doc_features[doc].items():
                    datawriter.writerow([doc, feature, count])
    util.duplicate_label_file(data_set, small_set)
    util.duplicate_count_file(data_set, small_set)
    print('Smaller dataset with labels [%s] and %d docs created from %s.'
          % (','.join(map(str, labels)), num_docs, data_set))
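# Hypothetical usage (dataset name and labels are illustrative, not from the
# project): sample 300 docs for each of labels 1 and 2 from '20news' and
# write the result as 's12_20news':
#
#   make_small_data_set('20news', 600, [1, 2])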
def setUp(self):
    self.doc = Document(TEST['NT-Document'],
                        TESTS['datatypes-intensional/test002.nt'])
    self.file = open_data_file('rdfcore/datatypes-intensional/test002.nt')
    self.reader = NTriplesReader()
    super().setUp()
def generate_knn_graph(data_set, k, verbose=False):
    num_docs, num_features = get_counts(data_set)
    assert k < num_docs
    feature_matrix = np.matrix(np.zeros((num_docs, num_features)))
    words_doc_count = np.zeros(num_features)
    is_tfidf = False
    docs = set()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0]) - 1
            word = int(row[1]) - 1
            if is_tfidf:
                count = float(row[2])
            elif '.' in row[2]:
                # a fractional count means the file already holds tf-idf weights
                count = float(row[2])
                is_tfidf = True
            else:
                count = int(row[2])
            words_doc_count[word] += 1
            docs.add(doc)
            feature_matrix.itemset((doc, word), count)
    if verbose: print 'Loaded test data'
    if verbose: print 'Generating feature matrix'
    if not is_tfidf:
        for doc in xrange(num_docs):
            if doc in docs:
                for word in xrange(num_features):
                    if words_doc_count[word] != 0:
                        count = feature_matrix.item((doc, word))
                        tfidf = math.log(count + 1) * math.log(
                            num_docs / float(words_doc_count[word]))
                        feature_matrix.itemset((doc, word), tfidf)
            if doc % 10 == 9:
                if verbose: print 'Processed %d out of %d documents' % (doc + 1, num_docs)
    if verbose: print 'Generated feature matrix'
    # Diagonal matrix of inverse row norms, so N*F has unit-length rows.
    normalizing_matrix = np.matrix(np.zeros((num_docs, num_docs)))
    for i in xrange(num_docs):
        f = feature_matrix[i]
        fft = math.sqrt(f * f.transpose())
        if fft < 1e-9:
            normalizing_matrix.itemset((i, i), 0.0)
        else:
            normalizing_matrix.itemset((i, i), 1.0 / fft)
    if verbose: print 'Generated normalizing matrix'
    if verbose: print 'Generating folded graph'
    edges = []
    N = normalizing_matrix
    F = feature_matrix
    for doc in xrange(num_docs):
        # row `doc` of N*F*F'*N: cosine similarity of this doc to every doc
        FtNv = F[doc].transpose() * N.item((doc, doc))
        doc_weights = np.array(N * (F * FtNv)).transpose()
        nearest_neighbors = np.argsort(doc_weights)
        for neighbor in nearest_neighbors[0][-k:]:
            if doc_weights.item(neighbor) < 1e-9:
                continue
            edges.append(((doc + 1, int(neighbor) + 1), doc_weights.item(neighbor)))
        if doc % 10 == 9:
            if verbose: print 'Processed %d out of %d documents' % (doc + 1, num_docs)
    if verbose: print 'Generated folded graph'
    filename = '%s-knn-k%d' % (data_set, k)
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for edge, weight in edges:
            datawriter.writerow([edge[0], edge[1], weight])
    if verbose: print 'Wrote graph file %s' % filename
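# Why the "folded graph" weights above are cosine similarities: with N the
# diagonal matrix of inverse row norms of F, row d of N*F*F'*N holds
# cos(doc_d, doc_j) for every j. A small standalone check (toy matrix, not
# project data):
import numpy as np

F = np.array([[1.0, 2.0, 0.0],
              [0.0, 1.0, 1.0],
              [2.0, 0.0, 1.0]])
norms = np.linalg.norm(F, axis=1)
N = np.diag(1.0 / norms)
folded = N.dot(F).dot(F.T).dot(N)            # all pairwise cosine similarities
by_hand = F.dot(F[0]) / (norms * norms[0])   # cosine of doc 0 vs every doc
assert np.allclose(folded[0], by_hand)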
def setUp(self):
    self.doc = Document(TEST['RDF-XML-Document'],
                        TESTS['datatypes/test001.rdf'])
    self.file = open_data_file('rdfcore/datatypes/test001.rdf')
    self.reader = RDFXMLReader()
    super().setUp()
def setUp(self):
    self.string = open_data_file('manifest.rdf').read()
    self.manifest = Manifest(self.string)
def setUp(self):
    self.file = open_data_file('manifest.rdf')
    self.manifest = Manifest(self.file)
def generate_knn_graphs(data_set, ks=[5, 10, 20, 30, 50, 100], verbose=False):
    '''Since we get a list of *all* the neighbors ordered by "nearness", it
    makes more sense to iterate through the different k's within the function
    rather than outside it.'''
    num_docs, num_features = get_counts(data_set)
    max_k = max(ks)
    assert max_k < num_docs
    feature_matrix = np.matrix(np.zeros((num_docs, num_features)))
    words_doc_count = np.zeros(num_features)
    is_tfidf = False
    docs = set()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0]) - 1
            word = int(row[1]) - 1
            if is_tfidf:
                count = float(row[2])
            elif '.' in row[2]:
                # a fractional count means the file already holds tf-idf weights
                count = float(row[2])
                is_tfidf = True
            else:
                count = int(row[2])
            words_doc_count[word] += 1
            docs.add(doc)
            feature_matrix.itemset((doc, word), count)
    if verbose: print 'Loaded test data'
    if verbose: print 'Generating feature matrix'
    if not is_tfidf:
        for doc in xrange(num_docs):
            if doc in docs:
                for word in xrange(num_features):
                    if words_doc_count[word] != 0:
                        count = feature_matrix.item((doc, word))
                        tfidf = math.log(count + 1) * math.log(
                            num_docs / float(words_doc_count[word]))
                        feature_matrix.itemset((doc, word), tfidf)
            if doc % 10 == 9:
                if verbose: print 'Processed %d out of %d documents' % (doc + 1, num_docs)
    if verbose: print 'Generated feature matrix'
    normalizing_matrix = np.matrix(np.zeros((num_docs, num_docs)))
    for i in xrange(num_docs):
        f = feature_matrix[i]
        fft = math.sqrt(f * f.transpose())
        if fft < 1e-9:
            normalizing_matrix.itemset((i, i), 0.0)
        else:
            normalizing_matrix.itemset((i, i), 1.0 / fft)
    if verbose: print 'Generated normalizing matrix'
    if verbose: print 'Generating folded graph'
    N = normalizing_matrix
    F = feature_matrix
    doc_neighbors = {}
    for doc in xrange(num_docs):
        # row `doc` of N*F*F'*N: cosine similarity of this doc to every doc
        FtNv = F[doc].transpose() * N.item((doc, doc))
        doc_weights = np.array(N * (F * FtNv)).transpose()
        neighbors = np.argsort(doc_weights)[0]
        # keep only the max_k nearest; each smaller k is a suffix of this list
        doc_neighbors[doc] = [(neighbor, doc_weights.item(neighbor))
                              for neighbor in neighbors[-max_k:]]
        if doc % 10 == 9:
            if verbose: print 'Processed %d out of %d documents' % (doc + 1, num_docs)
    if verbose: print 'Generated folded graph'
    for k in ks:
        filename = '%s-knn-k%d' % (data_set, k)
        with open_graph_file(filename) as graph:
            datawriter = csv.writer(graph, delimiter='\t')
            for doc in xrange(num_docs):
                for neighbor, weight in doc_neighbors[doc][-k:]:
                    if weight >= 1e-9:
                        datawriter.writerow([str(doc + 1), str(neighbor + 1), weight])
        if verbose: print 'Wrote graph file %s' % filename
sense = SenseHat()
sense.low_light = True
sense.set_rotation(cfg.ledRotation)
sense.clear()

# Temperature
cpu_temps = [util.get_cpu_temperature()] * 5

iot_data = {}
iot_data_tph = {}
iot_data_accel = {}
iot_data_gyro = {}
iot_data_compass = {}

fh = util.open_data_file(dataFile)
fh_tph = util.open_data_file(dataFileTph)
fh_accel = util.open_data_file(dataFileAccel)
fh_gyro = util.open_data_file(dataFileGyro)
fh_compass = util.open_data_file(dataFileCompass)

sprite = Sprites()

try:
    print("Sense HAT: Running (" + cfg.profile + ")\n")
    print("To monitor, use:")
    print("\t" + "$ watch -n 1 cat " + dataFile)
    print("\t" + "$ tail -f " + logFile)

    while True:
        iot_data.update(util.get_current_time())
def setUp(self):
    self.handler = URItoFileHandler(PATH_MAP)
    self.manifest_file = open_data_file('rdfcore/Manifest.rdf')
def setUp(self):
    self.reader = NTriplesReader()
    self.triples = self.reader.read(open_data_file('test.nt'))
    state = GPIO.input(pin_num)
    # low input (state == 0) is treated as a detection
    data["detected"] = not state
    return data

# ------------------------------------------------------------------------
# Main
# ------------------------------------------------------------------------
iot_data = {}
iot_data_water = {}

fh = util.open_data_file(dataFile)
fh_water = util.open_data_file(dataFileWater)

try:
    print("Sabre Water Leak Alarm: Running (" + cfg.profile + ") on pin "
          + str(cfg.gpioPinWaterLeakAlarm) + "\n")
    print("To monitor, use:")
    print("\t" + "$ watch -n 1 cat " + dataFile)
    print("\t" + "$ tail -f " + logFile)

    GPIO.setmode(GPIO.BCM)
    GPIO.setup(cfg.gpioPinWaterLeakAlarm, GPIO.IN, pull_up_down=GPIO.PUD_UP)

    while True:
        iot_data.update(util.get_current_time())
    except:
        err = sys.exc_info()[0]
        logging.error(err)
    return data

# ------------------------------------------------------------------------
# Main
# ------------------------------------------------------------------------
iot_data = {}
iot_data_gps = {}

fh = util.open_data_file(dataFile)
fh_gps = util.open_data_file(dataFileGps)

try:
    print("U-blox7 GPS/GLONASS: Running (" + cfg.profile + ")\n")
    print("To monitor, use:")
    print("\t" + "$ watch -n 1 cat " + dataFile)
    print("\t" + "$ tail -f " + logFile)

    # Use IP geolocation as a fallback
    geo_data = get_geo()
    iot_data.update(geo_data)
    iot_data["geo"] = iot_data["assetloc"]
    iot_data_gps["assetloc"] = iot_data["assetloc"]

    json_all = json.dumps(iot_data, separators=(',', ':'))
# fontFamily = "fonts/segoeui.ttf"
font = ImageFont.truetype(fontFamily, 22)
smallfont = ImageFont.truetype(fontFamily, 12)
fg_color = (255, 255, 255)
bg_color = (0, 0, 0)

# Temperature
cpu_temps = [util.get_cpu_temperature()] * 5

tph = BME280()

iot_data = {}
iot_data_tph = {}
iot_data_light = {}
iot_data_gas = {}

fh = util.open_data_file(dataFile)
fh_tph = util.open_data_file(dataFileTph)
fh_light = util.open_data_file(dataFileLight)
fh_gas = util.open_data_file(dataFileGas)

try:
    print("Enviro+ pHAT: Running (" + cfg.profile + ")\n")
    print("To monitor, use:")
    print("\t" + "$ watch -n 1 cat " + dataFile)
    print("\t" + "$ tail -f " + logFile)

    while True:
        iot_data.update(util.get_current_time())
        iot_data.update(get_temperature())
        iot_data.update(get_pressure())