def test_threaded_reader():
    """Load the tweets fixture with ``thread_count=2`` and expect 50 rows."""
    tweets = Reader(
        thread_count=2,
        reader=FileReader,
        from_path='tests/data/tweets',
    )
    frame = tweets.to_pandas()
    row_count = len(frame)
    assert row_count == 50
def create_data_reader(context: dict):
    """Build a ``Reader`` from the settings carried in ``context``.

    The project, path, extension and format are looked up under
    ``context['config']``; ``context.get('date')`` is used as both ends of
    the ``date_range`` tuple.
    """
    config = context['config']
    source_project = config.get('source_project')
    source_path = config.get('source_path')
    # 'extention' (sic) is the spelling used for both the config key and the
    # Reader keyword, so it is kept exactly as is.
    source_extention = config.get('source_extention')
    source_format = config.get('source_format')
    run_date = context.get('date')
    return Reader(
        project=source_project,
        from_path=source_path,
        extention=source_extention,
        data_format=source_format,
        date_range=(run_date, run_date),
    )
def test_reader_writer():
    """Write the fixture data, read it back and check the record count.

    ``do_writer()`` is called first, then everything under
    ``_tests/year_%Y/`` is read through ``FileReader`` and the number of
    records is expected to be 200000.
    """
    try:
        do_writer()
        reader = Reader(reader=FileReader, from_path='_tests/year_%Y/')
        record_count = len(list(reader))
    finally:
        # Remove the scratch directory even when writing or reading raises,
        # so a failed run does not leave stale files for later tests.
        shutil.rmtree("_tests", ignore_errors=True)
    assert record_count == 200000, record_count
def test_reader_context():
    """Use ``Reader`` as a context manager and count ``read_line`` results.

    Lines are pulled until ``read_line`` returns a falsy value; 50 lines are
    expected from the tweets fixture.
    """
    lines_seen = 0
    with Reader(reader=FileReader, from_path='tests/data/tweets') as r:
        while True:
            line = r.read_line()
            if not line:
                break
            lines_seen += 1
    assert lines_seen == 50
def test_reader_writer_compressed():
    """Write compressed fixture data, read it back and check the count.

    After ``do_writer_compressed()`` runs, at least one ``.lzma`` file must
    match ``_tests/**/*.lzma``. The data under ``_tests/year_%Y/`` is then
    read through ``FileReader`` and 200000 records are expected.
    """
    try:
        do_writer_compressed()
        compressed_files = glob.glob('_tests/**/*.lzma')
        assert len(compressed_files) > 0, compressed_files
        reader = Reader(reader=FileReader, from_path='_tests/year_%Y/')
        record_count = len(list(reader))
    finally:
        # Remove the scratch directory on every exit path, including a
        # failed glob assertion or an exception while writing or reading.
        shutil.rmtree("_tests", ignore_errors=True)
    assert record_count == 200000, record_count
def test_format_not_known():
    """Expect ``TypeError`` when ``Reader`` is built with these arguments.

    The call passes ``data_format='excel'`` and a single datetime (not a
    tuple) as ``date_range``.
    """
    try:
        Reader(
            project='',
            select=['a', 'b'],
            from_path='',
            date_range=datetime.datetime.now(),
            data_format='excel',
        )
    except TypeError:
        raised = True
    else:
        raised = False
    assert raised
def test_unknown_format():
    """Expect ``TypeError`` when ``Reader`` is given ``data_format='csv'``."""
    try:
        Reader(
            reader=FileReader,
            from_path='tests/data/tweets',
            data_format='csv',
        )
    except TypeError:
        rejected = True
    else:
        rejected = False
    assert rejected
def test_reader_select_not_list():
    """Expect ``TypeError`` when ``select`` is a string rather than a list."""
    try:
        Reader(
            project='',
            select='everything',
            from_path='',
            date_range=(datetime.datetime.now(), datetime.datetime.now()),
            data_format='json',
        )
    except TypeError:
        rejected = True
    else:
        rejected = False
    assert rejected
def test_reader_where_not_callable():
    """Expect ``TypeError`` when ``where`` is ``True`` instead of a callable."""
    try:
        Reader(
            project='',
            select=['a', 'b'],
            from_path='',
            where=True,
            date_range=(datetime.datetime.now(), datetime.datetime.now()),
            data_format='json',
        )
    except TypeError:
        rejected = True
    else:
        rejected = False
    assert rejected
def test_reader_all_good():
    """Check that constructing ``Reader`` with these arguments raises no ``TypeError``."""
    try:
        Reader(
            project='',
            select=['a', 'b'],
            from_path='',
            date_range=(datetime.datetime.now(), datetime.datetime.now()),
            data_format='json',
        )
    except TypeError:
        rejected = True
    else:
        rejected = False
    assert not rejected
def test_reader_to_pandas():
    """Convert the tweets fixture with ``to_pandas`` and expect 50 rows."""
    tweets = Reader(reader=FileReader, from_path='tests/data/tweets')
    frame = tweets.to_pandas()
    row_count = len(frame)
    assert row_count == 50
def test_reader_can_read():
    """Iterate the tweets fixture through ``FileReader`` and expect 50 items."""
    tweets = Reader(reader=FileReader, from_path='tests/data/tweets')
    records = list(tweets)
    assert len(records) == 50
if v == maximum: print(k, '*' * int(((v / maximum) * width) // 1), maximum) else: print(k, '*' * int(((v / maximum) * width) // 1)) if __name__ == "__main__": reader = Reader( thread_count=0, #select=['username'], #from_path='TWITTER/tweets/%datefolders/', from_path='TWITTER/tweets/year_%Y/month_%m/day_%d/', #where=lambda r: r['username'] in ['realDonaldTrump', 'BillGates', 'Twitter', 'Amazon', 'NBCNews', 'BBCNews', 'CNNNews'], #where=lambda r: ('coronavirus' in r['tweet'].lower()) or ('corona virus' in r['tweet'].lower()) or ('corona-virus' in r['tweet'].lower()), #where=lambda r: ('joyce' in r['text'].lower()), reader=MinioReader, end_point=os.getenv('MINIO_END_POINT'), access_key=os.getenv('MINIO_ACCESS_KEY'), secret_key=os.getenv('MINIO_SECRET_KEY'), start_date=datetime.date(2021, 1, 2), end_date=datetime.date(2021, 1, 2), secure=False) # save = SaveToMinioOperator( # end_point=os.getenv('MINIO_END_POINT'), # access_key=os.getenv('MINIO_ACCESS_KEY'), # secret_key=os.getenv('MINIO_SECRET_KEY'), # to_path="TWITTER/tweets/%datefolders/reformatted_twitter_%date.jsonl", # secure=False, # compress=False)
# Timing script: iterate every record the Reader yields from the Minio path
# below, then print the number of records and the elapsed seconds.

# Loading a local .env file is optional; when python-dotenv is not installed
# the MINIO_* values are taken from the existing environment only.
try:
    from dotenv import load_dotenv  # type:ignore
    from pathlib import Path
    env_path = Path('.') / '.env'
    load_dotenv(dotenv_path=env_path)
except ImportError:
    pass

reader = Reader(
    thread_count=4,
    reader=MinioReader,
    secure=False,
    end_point=os.getenv('MINIO_END_POINT'),
    access_key=os.getenv('MINIO_ACCESS_KEY'),
    secret_key=os.getenv('MINIO_SECRET_KEY'),
    from_path='SNAPSHOTS/NVD/NVD_CVE_LIST/%datefolders/',
    #data_format='text',
    #start_date=datetime.date(2020, 1, 30),
    #end_date=datetime.date(2020, 2, 5),
    #select=['username'],
    #where=lambda r: b'smb' in r
)

#reader = dictset.limit(reader, 100)

start = time.perf_counter_ns()
count = 0  # stays 0 when the reader yields nothing
# Number from 1 so that, after the loop, `count` is the number of records
# read rather than the zero-based index of the last one.
for count, item in enumerate(reader, start=1):
    pass
print(count, (time.perf_counter_ns() - start) / 1e9)
def create_data_reader(project, from_path, date):
    """Return a ``Reader`` for ``from_path`` in ``project`` for one date.

    ``date`` is used as both ends of the ``date_range`` tuple.
    """
    single_day = (date, date)
    return Reader(
        project=project,
        from_path=from_path,
        date_range=single_day,
    )
""" Schema Guesser Reads through a dataset to 'guess' the schema. Current implementation only lists all the values in a set of fields to work out the set of symbols for an enumerated type. """ from gva.data import Reader from gva.data.formats import dictset import json reader = Reader( project='dcsgva-da-prd', from_path= 'dcsgva-da-prd-ai-notebook/02_INTERMEDIATE/VIEWS/NVD_CVE_SUMMARY/%datefolders/' ) values = {} for record in reader: for k, v in record.items(): if k not in [ 'CVE', 'CWE', 'publishedDate', 'Description', 'v2.0:vectorString', 'v2.0:baseScore', 'v2.0:exploitabilityScore', 'v2.0:impactScore', 'v3.0:vectorString', 'v3.0:baseScore', 'v3.0:exploitabilityScore', 'v3.0:impactScore' ]: