def test_multiplexing(self):
    """Multiplexed cells should be exploded into one output row per value."""
    with open('./test/resources/multiplex.csv') as f:
        reader = casanova.reader(f, multiplex=('colors', '|'))

        assert list(reader) == [
            ['John', 'blue'],
            ['John', 'yellow'],
            ['John', 'orange'],
            ['Mary', 'purple'],
            ['Mary', 'blue'],
            ['Eustache', ''],
            ['Lizbeth', 'cyan']
        ]

    # A third tuple member renames the multiplexed column
    with open('./test/resources/multiplex.csv') as f:
        reader = casanova.reader(f, multiplex=('colors', '|', 'color?'))

        assert list(reader.cells('color?')) == [
            'blue', 'yellow', 'orange', 'purple', 'blue', '', 'cyan'
        ]

    # Prebuffering should count the exploded rows, not the raw ones
    with open('./test/resources/multiplex.csv') as f:
        reader = casanova.reader(f, multiplex=('colors', '|'), prebuffer_bytes=1024)

        assert reader.total == 7
def url_join_action(namespace):
    """Join two CSV files by matching urls of file1 against urls of file2.

    file1 (the "left" file) is indexed into a normalized url trie, then
    every url of file2 is matched against it; the matched left row (or
    empty padding) is appended to each right row.
    """
    right_reader = casanova.reader(namespace.file2)
    left_reader = casanova.reader(namespace.file1, namespace.output)
    output_file = open_output_file(namespace.output)
    output_writer = csv.writer(output_file)

    left_headers = left_reader.fieldnames
    left_indices = None

    # Optionally keep only a selection of the left file's columns
    if namespace.select is not None:
        selected = namespace.select.split(',')
        left_headers = [h for h in left_headers if h in selected]
        left_indices = collect_column_indices(left_reader.pos, left_headers)

    # Padding appended to right rows that have no match
    empty = [''] * len(left_headers)

    output_writer.writerow(right_reader.fieldnames + left_headers)

    loading_bar = tqdm(desc='Indexing left file', dynamic_ncols=True, unit=' lines')

    # First step is to index left file
    trie = NormalizedLRUTrie(strip_trailing_slash=True)

    for row, url in left_reader.cells(namespace.column1, with_rows=True):
        url = url.strip()

        if left_indices is not None:
            row = [row[i] for i in left_indices]

        trie.set(url, row)
        loading_bar.update()

    loading_bar.close()

    loading_bar = tqdm(desc='Matching right file', dynamic_ncols=True, unit=' lines')

    for row, url in right_reader.cells(namespace.column2, with_rows=True):
        url = url.strip()
        match = None

        if url:
            match = trie.match(url)

        loading_bar.update()

        if match is None:
            output_writer.writerow(row + empty)
            continue

        row.extend(match)
        output_writer.writerow(row)

    # BUGFIX: the second progress bar was never closed before exiting
    loading_bar.close()

    output_file.close()
def test_gzip(self):
    """Reader should accept both an open gzip handle and a .gz path."""
    with gzip.open('./test/resources/people.csv.gz', 'rt') as f:
        reader = casanova.reader(f)
        assert list(reader.cells('name')) == ['John', 'Mary', 'Julia']

    with casanova.reader('./test/resources/people.csv.gz') as reader:
        assert list(reader.cells('name')) == ['John', 'Mary', 'Julia']
def test_exceptions(self):
    """Invalid reader arguments should raise the documented exceptions."""
    with pytest.raises(EmptyFileError):
        casanova.reader(StringIO(''))

    with pytest.raises(TypeError):
        casanova.reader(StringIO('name\nYomgui'), buffer=4.5)

    with pytest.raises(TypeError):
        casanova.reader(StringIO('name\nYomgui'), buffer=-456)

    with pytest.raises(TypeError, match='multiplex'):
        casanova.reader(StringIO('name\nYomgui'), multiplex=(45, 'test'))

    with pytest.raises(MissingColumnError):
        casanova.reader(StringIO('name\nYomgui'), multiplex=('surname', 'test'))
def test_enumerate(self):
    """reader.enumerate should yield sequential zero-based indices."""
    with open('./test/resources/people.csv') as f:
        reader = casanova.reader(f)
        indices = [i for i, _ in reader.enumerate()]

        assert indices == list(range(3))
def sample_tweets(f, index, outdir, total=None, samples_sizes=None):
    """Write random per-period tweet samples from f as CSV files in outdir.

    index maps a period name to the list of row indexes belonging to it;
    one output file per (period, sample size) pair is produced.
    """
    # BUGFIX: mutable default argument ([100]) replaced by a None sentinel
    if samples_sizes is None:
        samples_sizes = [100]

    casa = casanova.reader(f)
    samples_indexes = defaultdict(dict)
    outfiles = {}

    for siz in samples_sizes:
        for period, indexes in sorted(index.items()):
            lentweets = len(indexes)
            ntweets = min(siz, lentweets)
            outf = os.path.join(outdir, "%s_sample_%s.csv" % (period, siz))
            print("-", period, ":", lentweets, "filtered tweets to", ntweets, "->", outf)
            outfiles[outf] = {"file": open(outf, "w")}
            outfiles[outf]["writer"] = csv.writer(outfiles[outf]["file"])
            outfiles[outf]["writer"].writerow(casa.fieldnames)
            # Map each sampled row index to the writer it should go to
            for i in sample(indexes, ntweets):
                samples_indexes[siz][i] = outfiles[outf]["writer"]

    # BUGFIX: pre-bind so the except handler cannot hit a NameError when
    # the failure happens before the first iteration
    i = row = None
    try:
        for i, row in enumerate(tqdm(casa, total=total)):
            for siz, idx in samples_indexes.items():
                if i in idx:
                    idx[i].writerow(row)
    except Exception:
        print("ERROR sampling while working on row #%s:" % i, row, file=sys.stderr)
        raise
    finally:
        # BUGFIX: output files are now closed even when an error is raised
        for outf in outfiles:
            outfiles[outf]["file"].close()
def test_basics(self):
    """Exercise the headers object and basic row iteration."""
    with open('./test/resources/people.csv') as f:
        reader = casanova.reader(f)

        assert reader.row_len == 2

        # Attribute, membership and subscript access to header positions
        assert reader.headers.name == 0
        assert reader.headers.surname == 1
        assert 'name' in reader.headers
        assert 'whatever' not in reader.headers
        assert reader.headers['name'] == 0
        assert reader.headers['surname'] == 1

        # dict-like .get with and without a default
        assert reader.headers.get('name') == 0
        assert reader.headers.get('whatever') is None
        assert reader.headers.get('whatever', 1) == 1

        assert len(reader.headers) == 2
        assert reader.fieldnames == ['name', 'surname']

        # Iteration and dict conversions
        assert list(reader.headers) == [('name', 0), ('surname', 1)]
        assert dict(list(reader.headers)) == {'name': 0, 'surname': 1}
        assert reader.headers.as_dict() == {'name': 0, 'surname': 1}

        with pytest.raises(KeyError):
            reader.headers['whatever']

        surnames = [row[reader.headers.surname] for row in reader]
        assert surnames == ['Matthews', 'Sue', 'Stone']
def url_join_action(cli_args):
    """Enrich file2's rows with columns of file1 matched by url.

    file1 is indexed into a normalized url trie; every url of file2 is then
    matched against it and the matching left row (prefixed columns) is
    appended through a casanova enricher.
    """
    left_reader = casanova.reader(cli_args.file1)
    left_headers = left_reader.fieldnames
    left_idx = None

    if cli_args.select:
        left_idx = left_reader.pos.collect(cli_args.select)
        left_headers = list(cli_args.select)

    # Applying column prefix now
    left_headers = [cli_args.match_column_prefix + h for h in left_headers]

    right_enricher = casanova.enricher(cli_args.file2, cli_args.output,
                                       add=left_headers)

    loading_bar = LoadingBar(desc='Indexing left file', unit='line')

    # First step is to index left file
    trie = NormalizedLRUTrie()

    for row, cell in left_reader.cells(cli_args.column1, with_rows=True):
        loading_bar.update()

        if left_idx is not None:
            row = [row[i] for i in left_idx]

        urls = [cell]

        # A cell may hold several urls joined by a separator
        if cli_args.separator is not None:
            urls = cell.split(cli_args.separator)

        for url in urls:
            url = url.strip()

            # NOTE: should we filter invalid urls here?
            if url:
                trie.set(url, row)

    loading_bar.close()

    loading_bar = LoadingBar(desc='Matching right file', unit='line')

    for row, url in right_enricher.cells(cli_args.column2, with_rows=True):
        loading_bar.update()

        url = url.strip()
        match = None

        # NOTE: should we filter invalid urls here?
        if url:
            match = trie.match(url)

        if match is None:
            right_enricher.writerow(row)
            continue

        right_enricher.writerow(row, match)

    # BUGFIX: the second progress bar was never closed
    loading_bar.close()
def test_no_headers(self):
    """With no_headers=True, fieldnames is None and rows are positional."""
    with open('./test/resources/no_headers.csv') as f:
        reader = casanova.reader(f, no_headers=True)

        assert reader.fieldnames is None

        assert [row[1] for row in reader] == ['Matthews', 'Sue', 'Stone']
def test_cells(self):
    """reader.cells yields single column values, optionally with rows."""
    with open('./test/resources/people.csv') as f:
        reader = casanova.reader(f)

        # Unknown columns must raise
        with pytest.raises(MissingColumnError):
            reader.cells('whatever')

        assert list(reader.cells('name')) == ['John', 'Mary', 'Julia']

    with open('./test/resources/people.csv') as f:
        reader = casanova.reader(f)

        pairs = [
            (row[1], name)
            for row, name in reader.cells('name', with_rows=True)
        ]

        assert pairs == [
            ('Matthews', 'John'),
            ('Sue', 'Mary'),
            ('Stone', 'Julia')
        ]
def test_iterable_input(self):
    """Reader should also consume an arbitrary iterable of rows."""
    def generator():
        yield ['name', 'surname']
        yield ['Victor', 'Carouso']
        yield ['Emily', 'Harknett']

    reader = casanova.reader(generator())

    assert list(reader.cells('name')) == ['Victor', 'Emily']
def test_cells_no_headers(self):
    """Without headers, cells are addressed by integer position."""
    with open('./test/resources/no_headers.csv') as f:
        reader = casanova.reader(f, no_headers=True)

        # Out-of-range positions must raise
        with pytest.raises(MissingColumnError):
            reader.cells(4)

        assert list(reader.cells(0)) == ['John', 'Mary', 'Julia']
def test_ignore_null_bytes(self):
    """ignore_null_bytes=True should strip NUL bytes from parsed rows."""
    with open('./test/resources/with_null_bytes.csv') as f:
        reader = casanova.reader(f, ignore_null_bytes=True)

        assert list(reader) == [
            ['John', 'Zero'],
            ['Mary', 'La Croix']
        ]
def test_wrap(self):
    """reader.wrap should expose a row through a dict-like facade."""
    with open('./test/resources/people.csv') as f:
        reader = casanova.reader(f)

        for row in reader:
            wrapped = reader.wrap(row)

            assert isinstance(wrapped, DictLikeRow)
            # Both subscript and attribute access should work
            assert wrapped['name'] == row[0]
            assert wrapped.surname == row[1]
def test_tricky(self):
    """reverse_reader must yield the same rows as reader, in reverse order."""
    with open('./test/resources/tricky_reverse.csv') as f:
        forward_rows = list(casanova.reader(f))

    with open('./test/resources/tricky_reverse.csv') as f:
        reverse_reader = casanova.reverse_reader(f)
        rows_read_in_reverse = list(reversed(list(reverse_reader)))

    assert rows_read_in_reverse == forward_rows
def test_prebuffer(self):
    """Prebuffering sets .total when the whole file fits in the buffer."""
    # Buffer large enough: total is known
    with open('./test/resources/people.csv') as f:
        reader = casanova.reader(f, prebuffer_bytes=1024)

        assert list(reader.cells('surname')) == ['Matthews', 'Sue', 'Stone']
        assert reader.total == 3

    # Buffer too small: total stays unknown but rows are intact
    with open('./test/resources/people.csv') as f:
        reader = casanova.reader(f, prebuffer_bytes=2)

        assert list(reader.cells('surname')) == ['Matthews', 'Sue', 'Stone']
        assert reader.total is None

    # Iteration can be interrupted and resumed mid-buffer
    with open('./test/resources/people.csv') as f:
        reader = casanova.reader(f, prebuffer_bytes=2)

        for surname in reader.cells('surname'):
            assert surname == 'Matthews'
            break

        assert list(reader.cells('surname')) == ['Sue', 'Stone']
def test_global_defaults(self):
    """set_default_prebuffer_bytes changes the module-wide default."""
    with pytest.raises(TypeError):
        set_default_prebuffer_bytes([])

    set_default_prebuffer_bytes(1024)

    with open('./test/resources/people.csv') as f:
        reader = casanova.reader(f)

        assert list(reader.cells('surname')) == ['Matthews', 'Sue', 'Stone']
        assert reader.total == 3

    # Reset the global default so other tests are unaffected
    set_default_prebuffer_bytes(None)
def bench(path, column, headers=True, skip_std=True):
    """Benchmark several ways of reading one column of a CSV file.

    Compares the stdlib csv readers against the different casanova access
    patterns. The `a = ...` assignments only force the value to be read.
    """
    if not skip_std:
        with Timer('csv.reader'):
            with open(path) as f:
                for row in csv.reader(f):
                    a = row[0]

    if headers:
        with Timer('csv.DictReader'):
            with open(path) as f:
                for row in csv.DictReader(f):
                    a = row[column]

    with Timer('casanova.reader: basic'):
        with open(path) as f:
            reader = casanova.reader(f, no_headers=not headers)
            for row in reader:
                a = row[reader.headers[column]]

    with Timer('casanova.reader: cached pos'):
        with open(path) as f:
            reader = casanova.reader(f, no_headers=not headers)
            pos = reader.headers[column]
            for row in reader:
                a = row[pos]

    with Timer('casanova.reader: cells'):
        with open(path) as f:
            reader = casanova.reader(f, no_headers=not headers)
            for value in reader.cells(column):
                a = value

    with Timer('casanova.reader: cells with_rows'):
        with open(path) as f:
            reader = casanova.reader(f, no_headers=not headers)
            for row, value in reader.cells(column, with_rows=True):
                a = value
def extract_users_urls_medias_from_csv(f, trie, of=sys.stdout, total=None,
                                       filter_fr=False, min_date=None):
    """Extract (user, url, webentity) records from a tweet CSV into `of`.

    Each url found in a tweet's `links` cell is normalized and matched
    against the webentity trie; unmatched urls are skipped.
    """
    headers = [
        'tweet_id', 'user_screen_name', 'user_id', 'normalized_url',
        'domain_name', 'webentity', 'datetime', 'is_retweet', 'nb_followers'
    ]
    writer = csv.writer(of)
    writer.writerow(headers)

    casa = casanova.reader(f)

    # BUGFIX: pre-bind so the except handler cannot hit a NameError when
    # the failure happens before the first iteration
    row = tid = uname = None
    try:
        # NOTE: `row` is the enumerate index here, not the CSV row itself
        for row, (tid, uname, uid, dtime, rtid, nbfols, links, lang) in tqdm(
                enumerate(casa.cells([
                    'id', 'from_user_name', 'from_user_id', 'created_at',
                    'retweeted_id', 'from_user_followercount', 'links', 'lang'
                ])), total=total):
            if filter_fr and lang != 'fr':
                continue
            if min_date and dtime < min_date:
                continue

            is_rt = (rtid != '')

            for url in links.split('|'):
                url = url.strip()
                if not url:
                    continue

                webentity = trie.match(url)
                normalized = normalize_url(url)
                domain = normalized.split("/")[0]

                if not webentity:
                    #if "twitter.com/" not in url and "youtube.com" not in url:
                    #    print('WARNING: url unmatched on row #%s: %s' % (row, domain), file=sys.stderr)
                    continue

                writer.writerow([
                    tid, uname, uid, normalized, domain, webentity,
                    dtime, is_rt, nbfols
                ])
    except Exception:
        print(
            'ERROR while processing row #%s (https://twitter.com/%s/statuses/%s)'
            % (row, uname, tid),
            file=sys.stderr)
        # BUGFIX: bare raise preserves the original traceback (was `raise (e)`)
        raise
def scrape_action(namespace):
    """Scrape pages in parallel using a JSON/YAML scraper definition.

    Items produced by the workers are written either as CSV rows or as
    ndjson records depending on namespace.format.
    """
    output_file = open_output_file(namespace.output)

    # Parsing scraper definition
    try:
        scraper = load_definition(namespace.scraper)
    except TypeError:
        die(['Unknown scraper format.', 'Expecting a JSON or YAML file.'])
    except Exception:
        # BUGFIX: was a bare `except:`, which would also swallow
        # SystemExit and KeyboardInterrupt
        die('Invalid scraper file.')

    if namespace.format == 'csv':
        output_headers = headers_from_definition(scraper)
        output_writer = csv.DictWriter(output_file, fieldnames=output_headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(output_file)

    loading_bar = tqdm(
        desc='Scraping pages',
        total=namespace.total,
        dynamic_ncols=True,
        unit=' pages'
    )

    loading_bar.set_postfix(p=namespace.processes)

    if namespace.glob is not None:
        files = create_glob_iterator(namespace, scraper)
    else:
        reader = casanova.reader(namespace.report)
        files = create_report_iterator(namespace, reader, scraper, loading_bar)

    with Pool(namespace.processes) as pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            # Workers may return a single item or a list of items
            if not isinstance(items, list):
                items = [items]

            for item in items:
                # Scalar results are wrapped to fit the csv/ndjson writers
                if not isinstance(item, dict):
                    item = {'value': item}

                output_writer.writerow(item)

    output_file.close()
def filter_and_index_tweets(f, period="week", total=None, filter_threads=True,
                            filter_retweets=True, filter_quotes=False):
    """Index tweet row numbers by time period, skipping filtered tweets.

    Returns a dict mapping a period key (produced by the chosen reducer)
    to the list of matching row indexes.
    """
    index = defaultdict(list)
    casa = casanova.reader(f)

    # NOTE: kept although unused — accessing .id validates the column exists
    id_pos = casa.pos.id
    created_at_pos = casa.pos.created_at
    threads_pos = casa.pos.collected_via_thread_only
    RT_pos = casa.pos.retweeted_id
    quote_pos = casa.pos.quoted_id

    try:
        sampler = time_reducers[period]
    except KeyError:
        print("ERROR: no reducer for a period named %s" % period, file=sys.stderr)
        # BUGFIX: sys.exit instead of the interactive-only `exit` builtin
        sys.exit(1)

    # BUGFIX: pre-bind so the except handler cannot hit a NameError when
    # the failure happens before the first iteration
    i = row = None
    try:
        for i, row in enumerate(tqdm(casa, total=total)):
            if (filter_threads and row[threads_pos] == "1") or \
               (filter_retweets and row[RT_pos]) or \
               (filter_quotes and row[quote_pos]):
                continue

            timeperiod = sampler(row[created_at_pos])
            index[timeperiod].append(i)
    except Exception:
        print("ERROR indexing while working on row #%s:" % i, row, file=sys.stderr)
        # BUGFIX: bare raise preserves the original traceback (was `raise (e)`)
        raise

    return index
def test_invalid_identifier_headers(self):
    """Headers that are not valid Python identifiers still work as keys."""
    with casanova.reader('./test/resources/invalid_headers.csv') as reader:
        names = list(reader.cells('Person\'s name'))

        assert names == ['John', 'Mary', 'Julia']
def action(namespace, output_file):
    """Fetch items from the CrowdTangle API and write them to output_file.

    Supports resuming a previous csv run (sorted by date) by reading the
    last written datetime and restarting from there.

    NOTE(review): relies on closure/module names (announce, item_name,
    csv_headers, get_args, method_name) defined outside this block.
    """
    # Do we need to resume?
    need_to_resume = False

    if getattr(namespace, 'resume', False):
        need_to_resume = True

        if namespace.output is None:
            die(
                'Cannot --resume without knowing the output (use -o/--output rather stdout).',
            )

        if namespace.sort_by != 'date':
            die('Cannot --resume if --sort_by is not `date`.')

        if namespace.format != 'csv':
            die('Cannot --resume jsonl format yet.')

        # Read the existing output to find the last fetched datetime
        with open(namespace.output, 'r', encoding='utf-8') as f:
            resume_reader = casanova.reader(f)
            last_cell = None
            resume_loader = tqdm(desc='Resuming', unit=' lines')

            for cell in resume_reader.cells('datetime'):
                resume_loader.update()
                last_cell = cell

            resume_loader.close()

            if last_cell is not None:
                last_date = last_cell.replace(' ', 'T')
                namespace.end_date = last_date
                print_err('Resuming from: %s' % last_date)

    if callable(announce):
        print_err(announce(namespace))

    # Loading bar
    loading_bar = tqdm(
        desc='Fetching %s' % item_name,
        dynamic_ncols=True,
        unit=' %s' % item_name,
        total=namespace.limit
    )

    if namespace.format == 'csv':
        writer = csv.writer(output_file)
        if not need_to_resume:
            writer.writerow(
                csv_headers(namespace) if callable(csv_headers) else csv_headers
            )
    else:
        writer = ndjson.writer(output_file)

    client = CrowdTangleAPIClient(namespace.token, rate_limit=namespace.rate_limit)

    args = []

    if callable(get_args):
        args = get_args(namespace)

    def before_sleep(retry_state):
        # Explain why the retrying client is about to back off
        exc = retry_state.outcome.exception()

        if isinstance(exc, CrowdTangleRateLimitExceeded):
            reason = 'Call failed because of rate limit!'
        elif isinstance(exc, CrowdTangleInvalidJSONError):
            reason = 'Call failed because of invalid JSON payload!'
        else:
            reason = 'Call failed because of server timeout!'

        tqdm.write(
            '%s\nWill wait for %s before attempting again.'
            % (reason, prettyprint_seconds(retry_state.idle_for, granularity=2)),
            file=sys.stderr
        )

    create_iterator = getattr(client, method_name)
    iterator = create_iterator(
        *args,
        partition_strategy=getattr(namespace, 'partition_strategy', None),
        limit=namespace.limit,
        format='csv_row' if namespace.format == 'csv' else 'raw',
        per_call=True,
        detailed=True,
        namespace=namespace,
        before_sleep=before_sleep
    )

    try:
        for details, items in iterator:
            if details is not None:
                loading_bar.set_postfix(**details)

            for item in items:
                writer.writerow(item)

            loading_bar.update(len(items))
    except CrowdTangleInvalidTokenError:
        loading_bar.close()
        die([
            'Your API token is invalid.',
            'Check that you indicated a valid one using the `--token` argument.'
        ])

    loading_bar.close()
def action(namespace, output_file):
    """Fetch items from the CrowdTangle API and write them to output_file.

    Older variant using CrowdTangleClient without a before_sleep callback.

    NOTE(review): relies on closure/module names (item_name, csv_headers,
    get_args, method_name) defined outside this block.
    """
    # Do we need to resume?
    need_to_resume = False

    if getattr(namespace, "resume", False):
        need_to_resume = True

        if namespace.output is None:
            die(
                "Cannot --resume without knowing the output (use -o/--output rather stdout).",
            )

        if namespace.sort_by != "date":
            die("Cannot --resume if --sort_by is not `date`.")

        if namespace.format != "csv":
            die("Cannot --resume jsonl format yet.")

        # Read the existing output to find the last fetched datetime
        with open(namespace.output, "r") as f:
            resume_reader = casanova.reader(f)
            last_cell = None
            resume_loader = tqdm(desc="Resuming", unit=" lines")

            for cell in resume_reader.cells("datetime"):
                resume_loader.update()
                last_cell = cell

            resume_loader.close()

            if last_cell is not None:
                last_date = last_cell.replace(" ", "T")
                namespace.end_date = last_date
                print_err("Resuming from: %s" % last_date)

    # Loading bar
    loading_bar = tqdm(
        desc="Fetching %s" % item_name,
        dynamic_ncols=True,
        unit=" %s" % item_name,
        total=namespace.limit,
    )

    if namespace.format == "csv":
        writer = csv.writer(output_file)
        if not need_to_resume:
            writer.writerow(
                csv_headers(namespace) if callable(csv_headers) else csv_headers
            )
    else:
        writer = ndjson.writer(output_file)

    client = CrowdTangleClient(namespace.token, rate_limit=namespace.rate_limit)

    args = []

    if callable(get_args):
        args = get_args(namespace)

    create_iterator = getattr(client, method_name)
    iterator = create_iterator(
        *args,
        partition_strategy=getattr(namespace, "partition_strategy", None),
        limit=namespace.limit,
        format="csv_row" if namespace.format == "csv" else "raw",
        per_call=True,
        detailed=True,
        namespace=namespace
    )

    try:
        for details, items in iterator:
            if details is not None:
                loading_bar.set_postfix(**details)

            for item in items:
                writer.writerow(item)

            loading_bar.update(len(items))
    except CrowdTangleInvalidTokenError:
        loading_bar.close()
        die([
            "Your API token is invalid.",
            "Check that you indicated a valid one using the `--token` argument.",
        ])

    loading_bar.close()
def test_dialect(self):
    """csv dialect kwargs such as delimiter should be forwarded."""
    with open('./test/resources/semicolons.csv') as f:
        reader = casanova.reader(f, delimiter=';')

        first_cells = [row[0] for row in reader]

        assert first_cells == ['Rose', 'Luke']
from casanova import reader, enricher
from collections import defaultdict

# For each of the 2000 selected followers, collect which "graines"
# (seed accounts) they follow, then enrich the followers file with
# the count and the pipe-joined list of those handles.

followers = set()
followee_list = defaultdict(list)

# Only the 2000 selected followers
with open("2000_followers_graines.csv") as h:
    filereader = reader(h)
    for row, follower_id in filereader.cells('follower_id', with_rows=True):
        followers.add(follower_id)

with open("followers_graines_version_2021_09_21.csv") as g:
    filereader = reader(g)
    twitter_handle_pos = filereader.headers['twitter_handle']
    for row, follower_id in filereader.cells('follower_id', with_rows=True):
        if follower_id in followers:
            followee_list[follower_id].append(row[twitter_handle_pos])

with open("2000_followers_graines.csv") as f, \
        open("2000_followers_graines_version_2021_09_21.csv", "w") as of:
    file_enricher = enricher(
        f, of, add=['count_graines_in_friends', 'graines_in_friends'])
    for row, follower_id in file_enricher.cells('follower_id', with_rows=True):
        nb = len(followee_list[follower_id])
        liste = "|".join(followee_list[follower_id])
        # BUGFIX: the computed values were never written, leaving the
        # enriched output empty (mirrors the sibling friends script)
        file_enricher.writerow(row, [nb, liste])
from casanova import reader, enricher
from collections import defaultdict

# Enrich the followers file with, for each follower, the count and the
# pipe-joined handles of the "graines" (seed accounts) that follow them.

friends = set()
friends_list = defaultdict(list)

# Only the 2000 selected followers
with open("friends_graines.csv") as h:
    filereader = reader(h)
    twitter_handle_pos = filereader.headers['twitter_handle']
    for row, friend_id in filereader.cells('friend_id', with_rows=True):
        friends.add(friend_id)
        friends_list[friend_id].append(row[twitter_handle_pos])

with open("2000_followers_graines_version_2021_09_21.csv") as f, \
        open("2000_followers_graines_version_2021_10_19.csv", "w") as of:
    file_enricher = enricher(
        f, of, add=['count_graines_in_followers', 'graines_in_followers'])

    # NOTE(review): the column read here is 'follower_id' while the loop
    # variable says friend_id — presumably intentional (matching followers
    # against the friends index); verify against the input files.
    for row, friend_id in file_enricher.cells('follower_id', with_rows=True):
        if friend_id in friends:
            handles = friends_list[friend_id]
            file_enricher.writerow(row, [len(handles), "|".join(handles)])
        else:
            file_enricher.writerow(row, [0, ""])
def test_path(self):
    """Reader should open a file path directly and be closable."""
    reader = casanova.reader('./test/resources/people.csv')

    assert list(reader.cells('name')) == ['John', 'Mary', 'Julia']

    reader.close()
def test_context(self):
    """Reader opened from a path should work as a context manager."""
    with casanova.reader('./test/resources/people.csv') as reader:
        names = list(reader.cells('name'))

        assert names == ['John', 'Mary', 'Julia']
def test_bom(self):
    """A leading BOM must not leak into the first header name."""
    with open('./test/resources/bom.csv', encoding='utf-8') as f:
        reader = casanova.reader(f)

        assert reader.fieldnames == ['name', 'color']
        assert 'name' in reader.headers