コード例 #1
0
    def test_multiplexing(self):
        # Multiplexing should split the target column on the separator,
        # emitting one output row per resulting value.
        expected_rows = [
            ['John', 'blue'],
            ['John', 'yellow'],
            ['John', 'orange'],
            ['Mary', 'purple'],
            ['Mary', 'blue'],
            ['Eustache', ''],
            ['Lizbeth', 'cyan']
        ]

        with open('./test/resources/multiplex.csv') as f:
            multiplexed = casanova.reader(f, multiplex=('colors', '|'))
            assert list(multiplexed) == expected_rows

        # A third tuple member renames the multiplexed column.
        with open('./test/resources/multiplex.csv') as f:
            renamed = casanova.reader(f, multiplex=('colors', '|', 'color?'))
            assert list(renamed.cells('color?')) == [
                'blue', 'yellow', 'orange', 'purple', 'blue', '', 'cyan'
            ]

        # Prebuffering must count the multiplexed rows, not the raw ones.
        with open('./test/resources/multiplex.csv') as f:
            prebuffered = casanova.reader(f, multiplex=('colors', '|'), prebuffer_bytes=1024)
            assert prebuffered.total == 7
コード例 #2
0
ファイル: url_join.py プロジェクト: AleksiKnuutila/minet-fork
def url_join_action(namespace):
    """Join the left file onto the right file by matching urls.

    The left file is indexed into a url trie, then each row of the right
    file is matched against it. Matched left columns (optionally filtered
    through --select) are appended to the right rows; unmatched rows are
    padded with empty cells.
    """
    right_reader = casanova.reader(namespace.file2)
    # NOTE(review): the second positional arg looks unusual for
    # casanova.reader — confirm it is expected by the version in use.
    left_reader = casanova.reader(namespace.file1, namespace.output)

    output_file = open_output_file(namespace.output)
    output_writer = csv.writer(output_file)

    left_headers = left_reader.fieldnames
    left_indices = None

    if namespace.select is not None:
        # Keep only the selected left columns, preserving file order.
        selected = namespace.select.split(',')
        left_headers = [h for h in left_headers if h in selected]
        left_indices = collect_column_indices(left_reader.pos, left_headers)

    # Padding used when a right row has no match.
    empty = [''] * len(left_headers)

    output_writer.writerow(right_reader.fieldnames + left_headers)

    loading_bar = tqdm(desc='Indexing left file',
                       dynamic_ncols=True,
                       unit=' lines')

    # First step is to index left file
    trie = NormalizedLRUTrie(strip_trailing_slash=True)

    for row, url in left_reader.cells(namespace.column1, with_rows=True):
        url = url.strip()

        if left_indices is not None:
            row = [row[i] for i in left_indices]

        trie.set(url, row)

        loading_bar.update()

    loading_bar.close()

    loading_bar = tqdm(desc='Matching right file',
                       dynamic_ncols=True,
                       unit=' lines')

    for row, url in right_reader.cells(namespace.column2, with_rows=True):
        url = url.strip()

        match = None

        if url:
            match = trie.match(url)

        loading_bar.update()

        if match is None:
            output_writer.writerow(row + empty)
            continue

        row.extend(match)
        output_writer.writerow(row)

    # Fix: the matching loading bar was previously never closed.
    loading_bar.close()

    output_file.close()
コード例 #3
0
    def test_gzip(self):
        expected = ['John', 'Mary', 'Julia']

        # Reading from an already-open gzip text handle.
        with gzip.open('./test/resources/people.csv.gz', 'rt') as f:
            handle_reader = casanova.reader(f)
            assert list(handle_reader.cells('name')) == expected

        # Reading directly from a .gz path: casanova opens the file itself.
        with casanova.reader('./test/resources/people.csv.gz') as path_reader:
            assert list(path_reader.cells('name')) == expected
コード例 #4
0
    def test_exceptions(self):
        # A completely empty input (not even a header) is an error.
        with pytest.raises(EmptyFileError):
            casanova.reader(StringIO(''))

        # `buffer` must be a positive integer.
        for bad_buffer in (4.5, -456):
            with pytest.raises(TypeError):
                casanova.reader(StringIO('name\nYomgui'), buffer=bad_buffer)

        # The multiplexed column name must be a string...
        with pytest.raises(TypeError, match='multiplex'):
            casanova.reader(StringIO('name\nYomgui'), multiplex=(45, 'test'))

        # ...and must exist in the header.
        with pytest.raises(MissingColumnError):
            casanova.reader(StringIO('name\nYomgui'), multiplex=('surname', 'test'))
コード例 #5
0
    def test_enumerate(self):
        # enumerate() should yield (index, row) pairs starting at 0.
        with open('./test/resources/people.csv') as f:
            people = casanova.reader(f)

            indices = [index for index, _ in people.enumerate()]
            assert indices == list(range(3))
コード例 #6
0
def sample_tweets(f, index, outdir, total=None, samples_sizes=None):
    """Write random samples of tweets from `f` to csv files under `outdir`.

    For every sample size and every period in `index` (mapping period ->
    list of eligible row indices), up to `size` indices are sampled and the
    matching rows are written to '<period>_sample_<size>.csv'.

    :param f: readable handle over the source tweet csv.
    :param index: mapping of period name to list of eligible row indices.
    :param outdir: directory receiving the sample files.
    :param total: optional row total for the progress bar.
    :param samples_sizes: list of sample sizes (defaults to [100]).
    """
    # Fix: use a None sentinel instead of a mutable default list that
    # would be shared across calls.
    if samples_sizes is None:
        samples_sizes = [100]

    casa = casanova.reader(f)
    samples_indexes = defaultdict(dict)
    outfiles = {}
    for siz in samples_sizes:
        for period, indexes in sorted(index.items()):
            lentweets = len(indexes)
            ntweets = min(siz, lentweets)
            outf = os.path.join(outdir, "%s_sample_%s.csv" % (period, siz))
            print("-", period, ":", lentweets, "filtered tweets to", ntweets,
                  "->", outf)
            outfiles[outf] = {"file": open(outf, "w")}
            outfiles[outf]["writer"] = csv.writer(outfiles[outf]["file"])
            outfiles[outf]["writer"].writerow(casa.fieldnames)
            # Map each sampled row index to the writer it must go to.
            for i in sample(indexes, ntweets):
                samples_indexes[siz][i] = outfiles[outf]["writer"]

    i = row = None  # defined upfront so the error message cannot NameError

    try:
        for i, row in enumerate(tqdm(casa, total=total)):
            for siz, idx in samples_indexes.items():
                if i in idx:
                    idx[i].writerow(row)

    except Exception:
        print("ERROR sampling while working on row #%s:" % i,
              row,
              file=sys.stderr)
        raise  # Fix: bare raise preserves the original traceback
    finally:
        # Fix: close the sample files even if iteration fails midway.
        for outf in outfiles:
            outfiles[outf]["file"].close()
コード例 #7
0
    def test_basics(self):
        with open('./test/resources/people.csv') as f:
            reader = casanova.reader(f)
            headers = reader.headers

            assert reader.row_len == 2

            # Attribute access on headers yields column positions.
            assert headers.name == 0
            assert headers.surname == 1

            # Membership testing.
            assert 'name' in headers
            assert 'whatever' not in headers

            # Subscript access.
            assert headers['name'] == 0
            assert headers['surname'] == 1

            # Dict-like .get with an optional default.
            assert headers.get('name') == 0
            assert headers.get('whatever') is None
            assert headers.get('whatever', 1) == 1

            assert len(headers) == 2
            assert reader.fieldnames == ['name', 'surname']

            # Iteration yields (fieldname, position) pairs.
            assert list(headers) == [('name', 0), ('surname', 1)]
            assert dict(list(headers)) == {'name': 0, 'surname': 1}
            assert headers.as_dict() == {'name': 0, 'surname': 1}

            # Unknown columns raise on subscript access.
            with pytest.raises(KeyError):
                headers['whatever']

            surname_pos = headers.surname
            surnames = [record[surname_pos] for record in reader]
            assert surnames == ['Matthews', 'Sue', 'Stone']
コード例 #8
0
def url_join_action(cli_args):
    """Join the left file onto the right file by url matching.

    The left file is indexed into a url trie, then each row of the right
    file is matched against it and enriched with the (optionally selected,
    prefixed) left columns.
    """
    left_reader = casanova.reader(cli_args.file1)
    left_headers = left_reader.fieldnames
    left_idx = None

    if cli_args.select:
        left_idx = left_reader.pos.collect(cli_args.select)
        left_headers = list(cli_args.select)

    # Applying column prefix now
    left_headers = [cli_args.match_column_prefix + h for h in left_headers]

    right_enricher = casanova.enricher(cli_args.file2,
                                       cli_args.output,
                                       add=left_headers)

    loading_bar = LoadingBar(desc='Indexing left file', unit='line')

    # First step is to index left file
    trie = NormalizedLRUTrie()

    for row, cell in left_reader.cells(cli_args.column1, with_rows=True):
        loading_bar.update()

        if left_idx is not None:
            row = [row[i] for i in left_idx]

        # A single cell may hold several urls when a separator is given.
        urls = [cell]

        if cli_args.separator is not None:
            urls = cell.split(cli_args.separator)

        for url in urls:
            url = url.strip()

            # NOTE: should we filter invalid urls here?
            if url:
                trie.set(url, row)

    loading_bar.close()

    loading_bar = LoadingBar(desc='Matching right file', unit='line')

    for row, url in right_enricher.cells(cli_args.column2, with_rows=True):
        loading_bar.update()

        url = url.strip()

        match = None

        # NOTE: should we filter invalid urls here?
        if url:
            match = trie.match(url)

        if match is None:
            right_enricher.writerow(row)
            continue

        right_enricher.writerow(row, match)

    # Fix: the matching loading bar was previously never closed.
    loading_bar.close()
コード例 #9
0
    def test_no_headers(self):
        # With no_headers=True there are no fieldnames to expose.
        with open('./test/resources/no_headers.csv') as f:
            headerless = casanova.reader(f, no_headers=True)

            assert headerless.fieldnames is None

            # Columns are then only addressable by position.
            surnames = [record[1] for record in headerless]
            assert surnames == ['Matthews', 'Sue', 'Stone']
コード例 #10
0
    def test_cells(self):
        with open('./test/resources/people.csv') as f:
            people = casanova.reader(f)

            # Asking for an unknown column must fail early.
            with pytest.raises(MissingColumnError):
                people.cells('whatever')

            assert list(people.cells('name')) == ['John', 'Mary', 'Julia']

        with open('./test/resources/people.csv') as f:
            people = casanova.reader(f)

            # with_rows=True yields (row, value) pairs.
            pairs = [
                (full_row[1], value)
                for full_row, value in people.cells('name', with_rows=True)
            ]

            assert pairs == [('Matthews', 'John'), ('Sue', 'Mary'), ('Stone', 'Julia')]
コード例 #11
0
    def test_iterable_input(self):
        # Any iterable of rows may stand in for a csv file; the first
        # yielded row is interpreted as the header.
        def rows():
            yield from [
                ['name', 'surname'],
                ['Victor', 'Carouso'],
                ['Emily', 'Harknett']
            ]

        reader = casanova.reader(rows())

        assert list(reader.cells('name')) == ['Victor', 'Emily']
コード例 #12
0
    def test_cells_no_headers(self):
        with open('./test/resources/no_headers.csv') as f:
            headerless = casanova.reader(f, no_headers=True)

            # Out-of-range positions must raise, just like unknown names.
            with pytest.raises(MissingColumnError):
                headerless.cells(4)

            assert list(headerless.cells(0)) == ['John', 'Mary', 'Julia']
コード例 #13
0
    def test_ignore_null_bytes(self):
        # Null bytes in the input must be dropped when asked to.
        with open('./test/resources/with_null_bytes.csv') as f:
            sanitized = casanova.reader(f, ignore_null_bytes=True)

            assert list(sanitized) == [
                ['John', 'Zero'],
                ['Mary', 'La Croix']
            ]
コード例 #14
0
    def test_wrap(self):
        with open('./test/resources/people.csv') as f:
            people = casanova.reader(f)

            for record in people:
                wrapped = people.wrap(record)

                # Wrapping exposes both subscript and attribute access
                # over the original row values.
                assert isinstance(wrapped, DictLikeRow)
                assert wrapped['name'] == record[0]
                assert wrapped.surname == record[1]
コード例 #15
0
    def test_tricky(self):
        # Reading the file backwards then reversing must reproduce the
        # rows yielded by a normal forward read.
        with open('./test/resources/tricky_reverse.csv') as f:
            forward_rows = list(casanova.reader(f))

        with open('./test/resources/tricky_reverse.csv') as f:
            backward = casanova.reverse_reader(f)

            assert list(reversed(list(backward))) == forward_rows
コード例 #16
0
    def test_prebuffer(self):
        expected_surnames = ['Matthews', 'Sue', 'Stone']

        # A buffer large enough for the whole file yields a known total.
        with open('./test/resources/people.csv') as f:
            buffered = casanova.reader(f, prebuffer_bytes=1024)

            assert list(buffered.cells('surname')) == expected_surnames
            assert buffered.total == 3

        # A too-small buffer cannot know the total but still reads all rows.
        with open('./test/resources/people.csv') as f:
            buffered = casanova.reader(f, prebuffer_bytes=2)

            assert list(buffered.cells('surname')) == expected_surnames
            assert buffered.total is None

        # Breaking out mid-iteration must not lose the remaining rows.
        with open('./test/resources/people.csv') as f:
            buffered = casanova.reader(f, prebuffer_bytes=2)

            for surname in buffered.cells('surname'):
                assert surname == 'Matthews'
                break

            assert list(buffered.cells('surname')) == ['Sue', 'Stone']
コード例 #17
0
    def test_global_defaults(self):
        # The global default prebuffer size must be validated...
        with pytest.raises(TypeError):
            set_default_prebuffer_bytes([])

        # ...and, once set, apply to readers created without the kwarg.
        set_default_prebuffer_bytes(1024)

        with open('./test/resources/people.csv') as f:
            buffered = casanova.reader(f)

            assert list(buffered.cells('surname')) == ['Matthews', 'Sue', 'Stone']
            assert buffered.total == 3

        # Restore global state for the other tests.
        set_default_prebuffer_bytes(None)
コード例 #18
0
def bench(path, column, headers=True, skip_std=True):
    """Micro-benchmark of the different ways to read `column` from the csv
    file at `path`, optionally comparing against the stdlib csv module.

    The `sink` variable only exists to force the per-row work to happen.
    """
    if not skip_std:
        with Timer('csv.reader'):
            with open(path) as f:
                for record in csv.reader(f):
                    sink = record[0]

        if headers:
            with Timer('csv.DictReader'):
                with open(path) as f:
                    for record in csv.DictReader(f):
                        sink = record[column]

    # Naive: resolve the column position on every single row.
    with Timer('casanova.reader: basic'):
        with open(path) as f:
            reader = casanova.reader(f, no_headers=not headers)
            for record in reader:
                sink = record[reader.headers[column]]

    # Resolve the position once, outside the loop.
    with Timer('casanova.reader: cached pos'):
        with open(path) as f:
            reader = casanova.reader(f, no_headers=not headers)
            position = reader.headers[column]

            for record in reader:
                sink = record[position]

    # Let the reader extract the cell itself.
    with Timer('casanova.reader: cells'):
        with open(path) as f:
            reader = casanova.reader(f, no_headers=not headers)
            for value in reader.cells(column):
                sink = value

    # Same, but also yielding the full row.
    with Timer('casanova.reader: cells with_rows'):
        with open(path) as f:
            reader = casanova.reader(f, no_headers=not headers)
            for record, value in reader.cells(column, with_rows=True):
                sink = value
コード例 #19
0
def extract_users_urls_medias_from_csv(f,
                                       trie,
                                       of=sys.stdout,
                                       total=None,
                                       filter_fr=False,
                                       min_date=None):
    """Extract one output row per (tweet, url) pair from the tweet csv `f`.

    Every url found in a tweet's `links` column is matched against the
    webentity `trie`; unmatched urls are silently dropped. Results are
    written as csv to `of`.

    :param f: readable handle over the source tweet csv.
    :param trie: url trie exposing a `match(url)` method.
    :param of: output file handle (defaults to stdout).
    :param total: optional row total for the progress bar.
    :param filter_fr: when True, keep only tweets whose lang is 'fr'.
    :param min_date: when set, drop tweets created before this date string.
    """
    headers = [
        'tweet_id', 'user_screen_name', 'user_id', 'normalized_url',
        'domain_name', 'webentity', 'datetime', 'is_retweet', 'nb_followers'
    ]
    writer = csv.writer(of)
    writer.writerow(headers)
    casa = casanova.reader(f)

    # Defined upfront so the error message below cannot hit a NameError
    # if the very first iteration fails.
    i = tid = uname = None

    try:
        # Fix: the enumerate index was previously named `row`, which was
        # misleading — it is the row *number*, not the row itself.
        for i, (tid, uname, uid, dtime, rtid, nbfols, links,
                lang) in tqdm(enumerate(
                    casa.cells([
                        'id', 'from_user_name', 'from_user_id', 'created_at',
                        'retweeted_id', 'from_user_followercount', 'links',
                        'lang'
                    ])),
                              total=total):
            if filter_fr and lang != 'fr':
                continue
            if min_date and dtime < min_date:
                continue
            is_rt = (rtid != '')
            for url in links.split('|'):
                url = url.strip()
                if not url:
                    continue
                webentity = trie.match(url)
                if not webentity:
                    # Urls unmatched by the trie are dropped silently.
                    continue
                # Only normalize urls we are actually going to write.
                normalized = normalize_url(url)
                domain = normalized.split("/")[0]
                writer.writerow([
                    tid, uname, uid, normalized, domain, webentity, dtime,
                    is_rt, nbfols
                ])

    except Exception:
        print(
            'ERROR while processing row #%s (https://twitter.com/%s/statuses/%s)'
            % (i, uname, tid),
            file=sys.stderr)
        raise  # Fix: bare raise preserves the original traceback
コード例 #20
0
ファイル: scrape.py プロジェクト: rangsutu88/minet
def scrape_action(namespace):
    """CLI action scraping pages listed either by --glob or by a fetch
    report, writing the scraped items as csv or ndjson."""

    output_file = open_output_file(namespace.output)

    # Parsing scraper definition
    try:
        scraper = load_definition(namespace.scraper)
    except TypeError:
        die(['Unknown scraper format.', 'Expecting a JSON or YAML file.'])
    except Exception:
        # Fix: bare `except:` would also swallow SystemExit and
        # KeyboardInterrupt.
        die('Invalid scraper file.')

    if namespace.format == 'csv':
        output_headers = headers_from_definition(scraper)
        output_writer = csv.DictWriter(output_file, fieldnames=output_headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(output_file)

    loading_bar = tqdm(desc='Scraping pages',
                       total=namespace.total,
                       dynamic_ncols=True,
                       unit=' pages')

    loading_bar.set_postfix(p=namespace.processes)

    if namespace.glob is not None:
        files = create_glob_iterator(namespace, scraper)
    else:
        reader = casanova.reader(namespace.report)
        files = create_report_iterator(namespace, reader, scraper, loading_bar)

    with Pool(namespace.processes) as pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            # Workers may return a single item or a list of items.
            if not isinstance(items, list):
                items = [items]

            for item in items:
                # Scalars are wrapped so csv/ndjson writers can emit them.
                if not isinstance(item, dict):
                    item = {'value': item}

                output_writer.writerow(item)

    # Fix: the loading bar was previously never closed.
    loading_bar.close()

    output_file.close()
コード例 #21
0
def filter_and_index_tweets(f,
                            period="week",
                            total=None,
                            filter_threads=True,
                            filter_retweets=True,
                            filter_quotes=False):
    """Index row positions of tweets from csv `f`, bucketed by time period.

    :param f: readable handle over the source tweet csv.
    :param period: name of a reducer in `time_reducers` (e.g. "week").
    :param total: optional row total for the progress bar.
    :param filter_threads: drop rows collected via thread only.
    :param filter_retweets: drop retweets.
    :param filter_quotes: drop quote tweets.
    :returns: dict mapping each time period to the list of row indices of
        tweets that survived the filters.
    """
    index = defaultdict(list)
    casa = casanova.reader(f)
    # NOTE(review): `id_pos` is unused below but the attribute access
    # likely validates the presence of the `id` column — confirm intent.
    id_pos = casa.pos.id
    created_at_pos = casa.pos.created_at
    threads_pos = casa.pos.collected_via_thread_only
    RT_pos = casa.pos.retweeted_id
    quote_pos = casa.pos.quoted_id

    try:
        sampler = time_reducers[period]
    except KeyError:
        print("ERROR: no reducer for a period named %s" % period,
              file=sys.stderr)
        # Fix: sys.exit instead of the interactive-only `exit` builtin.
        sys.exit(1)

    i = row = None  # defined upfront so the error message cannot NameError

    try:
        for i, row in enumerate(tqdm(casa, total=total)):
            if (filter_threads and row[threads_pos] == "1") or \
               (filter_retweets and row[RT_pos]) or \
               (filter_quotes and row[quote_pos]):
                continue
            timeperiod = sampler(row[created_at_pos])
            index[timeperiod].append(i)

    except Exception:
        print("ERROR indexing while working on row #%s:" % i,
              row,
              file=sys.stderr)
        raise  # Fix: bare raise preserves the original traceback

    return index
コード例 #22
0
 def test_invalid_identifier_headers(self):
     # Header names that are not valid python identifiers must still be
     # addressable through .cells().
     with casanova.reader('./test/resources/invalid_headers.csv') as reader:
         names = list(reader.cells("Person's name"))
         assert names == ['John', 'Mary', 'Julia']
コード例 #23
0
ファイル: utils.py プロジェクト: lebelgique/minet
    def action(namespace, output_file):
        """CLI action fetching items from the CrowdTangle API and writing
        them to `output_file` as csv or ndjson, with optional resuming.

        Relies on names from the enclosing scope: `announce`, `item_name`,
        `csv_headers`, `get_args` and `method_name`.
        """

        # Do we need to resume?
        need_to_resume = False

        if getattr(namespace, 'resume', False):
            need_to_resume = True

            # Resuming requires re-reading the previous output, so a real
            # output path (not stdout) is mandatory.
            if namespace.output is None:
                die(
                    'Cannot --resume without knowing the output (use -o/--output rather stdout).',
                )

            # The resume point is the last written datetime, which is only
            # meaningful for date-sorted csv output.
            if namespace.sort_by != 'date':
                die('Cannot --resume if --sort_by is not `date`.')

            if namespace.format != 'csv':
                die('Cannot --resume jsonl format yet.')

            # Scan the existing output to find the last written datetime.
            with open(namespace.output, 'r', encoding='utf-8') as f:
                resume_reader = casanova.reader(f)

                last_cell = None
                resume_loader = tqdm(desc='Resuming', unit=' lines')

                for cell in resume_reader.cells('datetime'):
                    resume_loader.update()
                    last_cell = cell

                resume_loader.close()

                if last_cell is not None:
                    # Reformat 'YYYY-MM-DD hh:mm:ss' into an ISO-like
                    # 'YYYY-MM-DDThh:mm:ss' end date.
                    last_date = last_cell.replace(' ', 'T')
                    namespace.end_date = last_date

                    print_err('Resuming from: %s' % last_date)

        # `announce` is an optional hook from the enclosing scope.
        if callable(announce):
            print_err(announce(namespace))

        # Loading bar
        loading_bar = tqdm(desc='Fetching %s' % item_name,
                           dynamic_ncols=True,
                           unit=' %s' % item_name,
                           total=namespace.limit)

        if namespace.format == 'csv':
            writer = csv.writer(output_file)

            # When resuming, the header line is already in the file.
            if not need_to_resume:
                writer.writerow(
                    csv_headers(namespace) if callable(csv_headers
                                                       ) else csv_headers)
        else:
            writer = ndjson.writer(output_file)

        client = CrowdTangleAPIClient(namespace.token,
                                      rate_limit=namespace.rate_limit)

        args = []

        if callable(get_args):
            args = get_args(namespace)

        def before_sleep(retry_state):
            # Tell the user why the client is about to back off and retry.
            exc = retry_state.outcome.exception()

            if isinstance(exc, CrowdTangleRateLimitExceeded):
                reason = 'Call failed because of rate limit!'

            elif isinstance(exc, CrowdTangleInvalidJSONError):
                reason = 'Call failed because of invalid JSON payload!'

            else:
                reason = 'Call failed because of server timeout!'

            tqdm.write(
                '%s\nWill wait for %s before attempting again.' %
                (reason,
                 prettyprint_seconds(retry_state.idle_for, granularity=2)),
                file=sys.stderr)

        # `method_name` selects which client endpoint to iterate over.
        create_iterator = getattr(client, method_name)
        iterator = create_iterator(
            *args,
            partition_strategy=getattr(namespace, 'partition_strategy', None),
            limit=namespace.limit,
            format='csv_row' if namespace.format == 'csv' else 'raw',
            per_call=True,
            detailed=True,
            namespace=namespace,
            before_sleep=before_sleep)

        try:
            for details, items in iterator:
                if details is not None:
                    # Surface the per-call details on the loading bar.
                    loading_bar.set_postfix(**details)

                for item in items:
                    writer.writerow(item)

                loading_bar.update(len(items))

        except CrowdTangleInvalidTokenError:
            loading_bar.close()
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])

        loading_bar.close()
コード例 #24
0
ファイル: utils.py プロジェクト: AleksiKnuutila/minet-fork
    def action(namespace, output_file):
        """CLI action fetching items from the CrowdTangle API and writing
        them to `output_file` as csv or ndjson, with optional resuming.

        Relies on names from the enclosing scope: `item_name`,
        `csv_headers`, `get_args` and `method_name`.
        """

        # Do we need to resume?
        need_to_resume = False

        if getattr(namespace, "resume", False):
            need_to_resume = True

            # Resuming requires re-reading the previous output, so a real
            # output path (not stdout) is mandatory.
            if namespace.output is None:
                die(
                    "Cannot --resume without knowing the output (use -o/--output rather stdout).",
                )

            # The resume point is the last written datetime, which is only
            # meaningful for date-sorted csv output.
            if namespace.sort_by != "date":
                die("Cannot --resume if --sort_by is not `date`.")

            if namespace.format != "csv":
                die("Cannot --resume jsonl format yet.")

            # Scan the existing output to find the last written datetime.
            with open(namespace.output, "r") as f:
                resume_reader = casanova.reader(f)

                last_cell = None
                resume_loader = tqdm(desc="Resuming", unit=" lines")

                for cell in resume_reader.cells("datetime"):
                    resume_loader.update()
                    last_cell = cell

                resume_loader.close()

                if last_cell is not None:
                    # Reformat 'YYYY-MM-DD hh:mm:ss' into an ISO-like
                    # 'YYYY-MM-DDThh:mm:ss' end date.
                    last_date = last_cell.replace(" ", "T")
                    namespace.end_date = last_date

                    print_err("Resuming from: %s" % last_date)

        # Loading bar
        loading_bar = tqdm(
            desc="Fetching %s" % item_name,
            dynamic_ncols=True,
            unit=" %s" % item_name,
            total=namespace.limit,
        )

        if namespace.format == "csv":
            writer = csv.writer(output_file)

            # When resuming, the header line is already in the file.
            if not need_to_resume:
                writer.writerow(
                    csv_headers(namespace) if callable(csv_headers
                                                       ) else csv_headers)
        else:
            writer = ndjson.writer(output_file)

        client = CrowdTangleClient(namespace.token,
                                   rate_limit=namespace.rate_limit)

        args = []

        if callable(get_args):
            args = get_args(namespace)

        # `method_name` selects which client endpoint to iterate over.
        create_iterator = getattr(client, method_name)
        iterator = create_iterator(
            *args,
            partition_strategy=getattr(namespace, "partition_strategy", None),
            limit=namespace.limit,
            format="csv_row" if namespace.format == "csv" else "raw",
            per_call=True,
            detailed=True,
            namespace=namespace)

        try:
            for details, items in iterator:
                if details is not None:
                    # Surface the per-call details on the loading bar.
                    loading_bar.set_postfix(**details)

                for item in items:
                    writer.writerow(item)

                loading_bar.update(len(items))

        except CrowdTangleInvalidTokenError:
            loading_bar.close()
            die([
                "Your API token is invalid.",
                "Check that you indicated a valid one using the `--token` argument.",
            ])

        loading_bar.close()
コード例 #25
0
    def test_dialect(self):
        # Extra csv dialect kwargs such as `delimiter` are passed through.
        with open('./test/resources/semicolons.csv') as f:
            semicolon_reader = casanova.reader(f, delimiter=';')

            first_cells = [record[0] for record in semicolon_reader]
            assert first_cells == ['Rose', 'Luke']
コード例 #26
0
from casanova import reader, enricher
from collections import defaultdict

# For each of the 2000 selected followers, collect the "graines" twitter
# handles found in the followers file, then enrich the followers file
# with the count and the joined list of those handles.

followers = set()
followee_list = defaultdict(list)

# Only the 2000 selected followers

with open("2000_followers_graines.csv") as h:
    filereader = reader(h)

    for row, follower_id in filereader.cells('follower_id', with_rows=True):
        followers.add(follower_id)

with open("followers_graines_version_2021_09_21.csv") as g:
    filereader = reader(g)

    twitter_handle_pos = filereader.headers['twitter_handle']

    for row, follower_id in filereader.cells('follower_id', with_rows=True):
        if follower_id in followers:
            followee_list[follower_id].append(row[twitter_handle_pos])

with open("2000_followers_graines.csv") as f, \
    open("2000_followers_graines_version_2021_09_21.csv", "w") as of:
    file_enricher = enricher(
        f, of, add=['count_graines_in_friends', 'graines_in_friends'])

    for row, follower_id in file_enricher.cells('follower_id', with_rows=True):
        nb = len(followee_list[follower_id])
        liste = "|".join(followee_list[follower_id])
        # Fix: the computed enrichment was never written out, leaving the
        # output file with nothing but its header row.
        file_enricher.writerow(row, [nb, liste])
コード例 #27
0
from casanova import reader, enricher
from collections import defaultdict

# For each of the 2000 selected followers, enrich the followers file with
# the count and joined list of "graines" handles found among their friends.

friends = set()
friends_list = defaultdict(list)

# Only the 2000 selected followers

with open("friends_graines.csv") as h:
    graines_reader = reader(h)

    handle_pos = graines_reader.headers['twitter_handle']

    for row, friend_id in graines_reader.cells('friend_id', with_rows=True):
        friends.add(friend_id)
        friends_list[friend_id].append(row[handle_pos])

with open("2000_followers_graines_version_2021_09_21.csv") as f, \
    open("2000_followers_graines_version_2021_10_19.csv", "w") as of:
    file_enricher = enricher(
        f, of, add=['count_graines_in_followers', 'graines_in_followers'])

    for row, friend_id in file_enricher.cells('follower_id', with_rows=True):
        # Guarding with `in` avoids inserting empty defaultdict entries.
        if friend_id in friends:
            handles = friends_list[friend_id]
        else:
            handles = []

        file_enricher.writerow(row, [len(handles), "|".join(handles)])
コード例 #28
0
    def test_path(self):
        # The reader accepts a filesystem path directly and must then be
        # closed explicitly, since it owns the underlying handle.
        path_reader = casanova.reader('./test/resources/people.csv')

        names = list(path_reader.cells('name'))
        assert names == ['John', 'Mary', 'Julia']

        path_reader.close()
コード例 #29
0
 def test_context(self):
     # The reader is usable as a context manager closing its own handle.
     with casanova.reader('./test/resources/people.csv') as reader:
         names = list(reader.cells('name'))
         assert names == ['John', 'Mary', 'Julia']
コード例 #30
0
    def test_bom(self):
        # A leading utf-8 BOM must be stripped from the first fieldname
        # so that 'name' is addressable normally.
        with open('./test/resources/bom.csv', encoding='utf-8') as f:
            reader = casanova.reader(f)

            assert reader.fieldnames == ['name', 'color']
            assert 'name' in reader.headers