def preprocess_memex_data_sources(self, folder_path):
    # Group inferlink extractions by top-level domain into Source/Column objects.
    source_map = OrderedDict()
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        print(file_path)
        with open(file_path, "r") as f:
            for json_line in f.readlines():
                json_obj = json.loads(json_line)
                source_name = json_obj["tld"]
                if source_name not in source_map:
                    source_map[source_name] = Source(source_name)
                source = source_map[source_name]
                for attr in json_obj:
                    if attr.startswith("inferlink"):
                        attr_name = attr.split("_")[1]
                        if attr_name not in source.column_map:
                            source.column_map[attr_name] = Column(attr_name, source.name)
                            source.column_map[attr_name].semantic_type = attr_name
                        for ele1 in json_obj[attr]:
                            # "result" is either a single {"value": ...} dict or a list of them.
                            if isinstance(ele1["result"], dict):
                                source.column_map[attr_name].add_value(ele1["result"]["value"])
                            else:
                                for ele2 in ele1["result"]:
                                    source.column_map[attr_name].add_value(ele2["value"])
    # Write one CSV per source that actually collected values.
    for source in source_map.values():
        if source.column_map:
            source.write_csv_file("data/datasets/memex/%s" % source.name)
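# Illustrative sketch (not taken from the original data): the shape of a JSON line that
# preprocess_memex_data_sources() above can consume. The "tld" key, the "inferlink_*"
# attribute prefix, and the "result"/"value" nesting follow the code; the concrete field
# names and values are hypothetical.
_example_json_line = {
    "tld": "example.com",
    "inferlink_city": [
        {"result": {"value": "Los Angeles"}},                       # single-result form
        {"result": [{"value": "Seattle"}, {"value": "Portland"}]},  # multi-result form
    ],
}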
def add_semantic_type(column=None, semantic_type=None):
    if not (column and semantic_type):
        column = request.json["column"]
        semantic_type = request.json["semantic_type"]
    column_name = list(column.keys())[0]
    if column and semantic_type and column_name:
        source = Source(column_name)
        source.read_data_from_dict(column)
        source.set_semantic_type(semantic_type, column_name)
        _id = get_new_index_name(semantic_type, column_name)
        source.save(index_config={"name": _id, "size": 0})
        return str(_id)
def read_data_sources(self, folder_path):
    data_folder_path = os.path.join(folder_path, "data")
    model_folder_path = os.path.join(folder_path, "model")
    for filename in os.listdir(data_folder_path):
        extension = os.path.splitext(filename)[1]
        source = Source(os.path.splitext(filename)[0], self.sc)
        file_path = os.path.join(data_folder_path, filename)
        if extension == ".csv":
            source.read_data_from_csv(file_path)
        elif extension == ".json":
            source.read_data_from_json(file_path)
        elif extension == ".xml":
            source.read_data_from_xml(file_path)
        self.source_map[filename] = source
    for filename in os.listdir(model_folder_path):
        source = self.source_map[os.path.splitext(os.path.splitext(filename)[0])[0]]
        source.read_semantic_type_json(os.path.join(model_folder_path, filename))
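# Minimal sketch of the on-disk layout read_data_sources() above expects (the file names
# here are hypothetical): a "data" sub-folder holding the raw sources, dispatched by
# extension, and a "model" sub-folder holding per-source semantic-type JSON files that
# are matched back to a data source by stripping their double extension.
#
#   <folder_path>/
#       data/
#           museum.csv
#           people.json
#       model/
#           museum.model.json
#           people.model.json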
def add_semantic_type(column=None, semantic_type=None):
    try:
        if not (column and semantic_type):
            column = request.json["column"]
            semantic_type = request.json["semantic_type"]
        logging.info("Adding semantic type: {}".format(semantic_type))
        column_name = list(column.keys())[0]
        if column and semantic_type and column_name:
            source = Source(column_name)
            source.read_data_from_dict(column)
            source.set_semantic_type(semantic_type, column_name)
            _id = get_new_index_name(semantic_type, column_name)
            source.save(index_config={"name": _id, "size": 0})
            resp = jsonify({"index_name": _id})
            resp.status_code = 200
            return resp
    except Exception as e:
        return error("Semantic type adding failed: {}".format(e.args))
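# Hypothetical client call for the add_semantic_type endpoint above; the route path, host,
# and port are assumptions (the snippet does not show its @app.route decorator). The payload
# mirrors what the handler reads: a one-column {name: values} dict plus the semantic type.
import requests

resp = requests.post(
    "http://localhost:5000/semantic_type",  # assumed route
    json={
        "column": {"city_name": ["Los Angeles", "Seattle"]},
        "semantic_type": "City",
    },
)
print(resp.json())  # e.g. {"index_name": "..."} on success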
def configure(self):
    sources = Config.options('sources')
    if len(sources) < 1:
        raise RuntimeError('At least one Source must be configured!')
    self.mixer = Mixer()
    self.sources = []
    for name, url in Config.items('sources'):
        source = Source(name, url)
        self.mixer.append(source)
        self.sources.append(source)
    self.mixer.configure()
    if Config.has_option('output', 'rtmp_push_url'):
        rtmp_push_url = Config.get('output', 'rtmp_push_url')
        self.sink = RtmpSink(rtmp_push_url, self.mixer.output_width, self.mixer.output_height)
    else:
        self.sink = LocalSink(self.mixer.output_width, self.mixer.output_height)
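# A possible config file for configure() above (section names follow the code, the values
# are illustrative). Each entry in [sources] becomes a Source(name, url) appended to the
# Mixer; the optional rtmp_push_url switches the sink from LocalSink to RtmpSink.
#
#   [sources]
#   camera1 = rtmp://10.0.0.10/live/cam1
#   camera2 = rtmp://10.0.0.11/live/cam2
#
#   [output]
#   rtmp_push_url = rtmp://streaming.example.com/live/mixed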
def setUp(self):
    self.fd, self.data_file = tempfile.mkstemp()
    os.unlink(self.data_file)
    self.source = Source(self.data_file)
class Test_Source(unittest.TestCase):

    def setUp(self):
        self.fd, self.data_file = tempfile.mkstemp()
        os.unlink(self.data_file)
        self.source = Source(self.data_file)

    def tearDown(self):
        os.close(self.fd)

    def test__read_from_file(self):
        expected = {'test_key': 'test_value'}
        args = ['level1', 'level2']
        with open(self.data_file, 'w') as f:
            f.write(json.dumps({args[0]: {args[1]: expected}}))
        actual = self.source._read_from_file(*args)
        self.assertEqual(expected, actual)

    def test__read_from_file_no_args(self):
        expected = {'test_key': 'test_value'}
        with open(self.data_file, 'w') as f:
            f.write(json.dumps(expected))
        actual = self.source._read_from_file()
        self.assertEqual(expected, actual)

    def test__read_from_file_file_does_not_exist(self):
        with self.assertRaises(IOError):
            self.source._read_from_file()

    def test__read_from_file_file_corrupt(self):
        with open(self.data_file, 'w') as f:
            f.write('corrupt file')
        with self.assertRaises(ValueError):
            self.source._read_from_file()

    def test__read_from_file_bad_key(self):
        with open(self.data_file, 'w') as f:
            f.write(json.dumps({}))
        with self.assertRaises(KeyError):
            self.source._read_from_file('key')

    def test__write_to_file(self):
        key = 'test_key'
        value = 'test_value'
        args = ['test_arg1', 'test_arg2']
        expected = {args[0]: {args[1]: {key: value}}}
        self.source._write_to_file(key, value, *args)
        with open(self.data_file) as f:
            actual = json.loads(f.read())
        self.assertEqual(expected, actual)

    def test__write_to_file_no_args(self):
        key = 'test_key'
        value = 'test_value'
        expected = {key: value}
        self.source._write_to_file(key, value)
        with open(self.data_file) as f:
            actual = json.loads(f.read())
        self.assertEqual(expected, actual)

    def test__write_to_file_file_exists(self):
        new_key = 'test_key'
        new_value = 'test_value'
        old_key = 'old_key'
        old_value = 'old_value'
        expected = {new_key: new_value, old_key: old_value}
        with open(self.data_file, 'w') as f:
            f.write(json.dumps({old_key: old_value}))
        self.source._write_to_file(new_key, new_value)
        with open(self.data_file) as f:
            actual = json.loads(f.read())
        self.assertEqual(expected, actual)

    def test__write_to_file_file_corrupt(self):
        key = 'test_key'
        value = 'test_value'
        expected = {key: value}
        with open(self.data_file, 'w') as f:
            f.write('corrupted_data')
        self.source._write_to_file(key, value)
        with open(self.data_file) as f:
            actual = json.loads(f.read())
        self.assertEqual(expected, actual)
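# The Test_Source suite above can be run with the standard unittest runner, assuming
# Source, json, os, tempfile and unittest are importable in the test module.
if __name__ == '__main__':
    unittest.main()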
def read_data_sources(self, folder_paths):
    logging.info("Reading data sources...")
    for folder_name in folder_paths:
        folder_path = os.path.join(self.data_folder, folder_name)
        logging.info("-->folder: {}".format(folder_path))
        source_map = OrderedDict()
        data_folder_path = os.path.join(folder_path, "data")
        model_folder_path = os.path.join(folder_path, "model")
        for filename in os.listdir(data_folder_path):
            extension = os.path.splitext(filename)[1]
            if ".DS" in filename:
                continue
            logging.info(" ...file: {}".format(filename))
            print(filename)
            source = Source(os.path.splitext(filename)[0])
            file_path = os.path.join(data_folder_path, filename)
            if "full" in data_folder_path:
                source.read_data_from_wc_csv(file_path)
            elif extension == ".csv":
                source.read_data_from_csv(file_path)
            elif extension == ".json":
                source.read_data_from_json(file_path)
            elif extension == ".xml":
                source.read_data_from_xml(file_path)
            else:
                source.read_data_from_text_file(file_path)
            source_map[filename] = source
        if os.path.exists(model_folder_path):
            for filename in os.listdir(model_folder_path):
                if ".DS" in filename:
                    continue
                try:
                    source = source_map[os.path.splitext(os.path.splitext(filename)[0])[0]]
                except KeyError:
                    source = source_map[filename]
                extension = os.path.splitext(filename)[1]
                if extension == ".json":
                    source.read_semantic_type_json(os.path.join(model_folder_path, filename))
                else:
                    print(source)
                    source.read_semantic_type_from_gold(os.path.join(model_folder_path, filename))
        self.dataset_map[folder_name] = source_map
def read_data_sources(self, folder_paths):
    semantic_type_set = set()
    attr_count = 0
    for folder_name in folder_paths:
        self.logger.debug("Read dataset: %s", folder_name)
        folder_path = "data/datasets/%s" % folder_name
        source_map = OrderedDict()
        data_folder_path = os.path.join(folder_path, "data")
        model_folder_path = os.path.join(folder_path, "model")
        for filename in sorted(os.listdir(data_folder_path)):
            extension = os.path.splitext(filename)[1]
            if ".DS" in filename:
                continue
            self.logger.debug(" -> read: %s", filename)
            source = Source(os.path.splitext(filename)[0])
            file_path = os.path.join(data_folder_path, filename)
            if "full" in data_folder_path:
                source.read_data_from_wc_csv(file_path)
            elif extension == ".csv":
                source.read_data_from_csv(file_path)
            elif extension == ".json":
                source.read_data_from_json(file_path)
            elif extension == ".xml":
                source.read_data_from_xml(file_path)
            else:
                source.read_data_from_text_file(file_path)
            source_map[filename] = source
            # NOTE: BINH delete empty columns here!!!, blindly follows the code in indexer:36
            for key in list(source.column_map.keys()):
                column = source.column_map[key]
                if column.semantic_type:
                    if len(column.value_list) == 0:
                        del source.column_map[key]
                        source.empty_val_columns[key] = column
                        logging.warning("Indexer: IGNORE COLUMN `%s` in source `%s` because of empty values",
                                        column.name, source.name)
            for column in source.column_map.values():
                semantic_type_set.add(column.semantic_type)
            attr_count += len(source.column_map.values())
        if os.path.exists(model_folder_path):
            for filename in os.listdir(model_folder_path):
                if ".DS" in filename:
                    continue
                try:
                    source = source_map[os.path.splitext(os.path.splitext(filename)[0])[0]]
                except KeyError:
                    source = source_map[filename]
                extension = os.path.splitext(filename)[1]
                if extension == ".json":
                    source.read_semantic_type_json(os.path.join(model_folder_path, filename))
                else:
                    print(source)
                    source.read_semantic_type_from_gold(os.path.join(model_folder_path, filename))
        self.dataset_map[folder_name] = source_map
    # print(semantic_type_set)
    print(len(semantic_type_set))
    print(attr_count)
def read_data_sources(self, folder_paths):
    semantic_type_set = set()
    attr_count = 0
    for folder_name in folder_paths:
        self.logger.debug("Read dataset: %s", folder_name)
        folder_path = "data/datasets/%s" % folder_name
        source_map = OrderedDict()
        data_folder_path = os.path.join(folder_path, "tables")
        model_folder_path = os.path.join(folder_path, "models")
        for filename in sorted(os.listdir(data_folder_path)):
            extension = os.path.splitext(filename)[1]
            if ".DS" in filename:
                continue
            self.logger.debug(" -> read: %s", filename)
            source = Source(os.path.splitext(filename)[0])
            file_path = os.path.join(data_folder_path, filename)
            if "full" in data_folder_path:
                source.read_data_from_wc_csv(file_path)
            elif extension == ".csv":
                source.read_data_from_csv(file_path)
            elif extension == ".json":
                source.read_data_from_json(file_path)
            elif extension == ".xml":
                source.read_data_from_xml(file_path)
            else:
                source.read_data_from_text_file(file_path)
            source_map[filename] = source
            if 'rowNumber' in source.column_map:
                del source.column_map['rowNumber']
            # NOTE: BINH delete empty columns here!!!, blindly follows the code in indexer:36
            for key in list(source.column_map.keys()):
                column = source.column_map[key]
                if column.semantic_type:
                    if len(column.value_list) == 0:
                        del source.column_map[key]
                        source.empty_val_columns[key] = column
                        logging.warning("Indexer: IGNORE COLUMN `%s` in source `%s` because of empty values",
                                        column.name, source.name)
            for column in source.column_map.values():
                semantic_type_set.add(column.semantic_type)
            attr_count += len(source.column_map.values())
        if os.path.exists(model_folder_path):
            for filename in os.listdir(model_folder_path):
                if ".DS" in filename:
                    continue
                try:
                    source = source_map[os.path.splitext(os.path.splitext(filename)[0])[0]]
                except KeyError:
                    source = source_map[filename]
                extension = os.path.splitext(filename)[1]
                if extension == ".json":
                    source.read_semantic_type_json(os.path.join(model_folder_path, filename))
                else:
                    print(source)
                    source.read_semantic_type_from_gold(os.path.join(model_folder_path, filename))
        self.dataset_map[folder_name] = source_map
    # print(semantic_type_set)
    print(len(semantic_type_set))
    print(attr_count)
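# Hypothetical driver for the variant above; the dataset folder names and the 'indexer'
# instance are assumptions. Each name is resolved to data/datasets/<name>/tables and
# .../models, and the parsed sources land in self.dataset_map keyed by folder name.
indexer.read_data_sources(["museum_crm", "soccer"])
print(indexer.dataset_map["soccer"].keys())  # source file names read for that dataset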