def count_lines_and_files(paths_lines=None,
                          paths_files=None,
                          line_glob=None,
                          file_glob=None) -> dict:
    """Counts lines and files in the given paths."""

    result = {}

    for path in arg_to_iter(paths_lines):
        path = Path(path).resolve()
        if path.is_dir():
            files = path.glob(line_glob) if line_glob else path.iterdir()
        elif path.is_file():
            files = (path, )
        else:
            files = ()
        for file in files:
            LOGGER.info("Counting lines in <%s>...", file)
            name = os.path.splitext(file.name)[0]
            result[f"lc_{name}"] = count_lines(file)

    for path in arg_to_iter(paths_files):
        path = Path(path).resolve()
        if not path.is_dir():
            continue
        for subdir in path.glob("**"):
            LOGGER.info("Counting files in <%s>...", subdir)
            if path == subdir:
                name = path.name
            else:
                relative = subdir.relative_to(path)
                name = "_".join(relative.parts)
            result[f"fc_{name}"] = count_files(subdir, glob=file_glob)

    return result
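Every snippet here leans on arg_to_iter to normalise its arguments. As a point of reference, this is a minimal sketch of the assumed semantics (mirroring scrapy.utils.misc.arg_to_iter), not the project's actual helper: None yields nothing, strings, bytes and dicts count as single values, and other iterables pass through unchanged.

def arg_to_iter(arg):
    """Minimal sketch of the assumed helper; not the project's actual code."""
    if arg is None:
        return ()
    if isinstance(arg, (str, bytes, dict)) or not hasattr(arg, "__iter__"):
        return (arg,)
    return arg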
    def get_previous_link(self):
        url = super().get_previous_link()
        if url is None:
            return None
        for key, parser in zip(arg_to_iter(self.keys), arg_to_iter(self.parsers)):
            params = ",".join(
                map(str, sorted(_extract_params(self.request, key, parser)))
            )
            url = (
                replace_query_param(url, key, params)
                if params
                else remove_query_param(url, key)
            )
        return url
    def handle(self, *args, **kwargs):
        logging.basicConfig(
            stream=sys.stderr,
            level=logging.DEBUG if kwargs["verbosity"] > 1 else logging.INFO,
            format="%(asctime)s %(levelname)-8.8s [%(name)s:%(lineno)s] %(message)s",
        )

        LOGGER.info(kwargs)

        if kwargs["delete"]:
            LOGGER.info("deleting destination dir <%s>", kwargs["destination"])
            rmtree(kwargs["destination"], ignore_errors=True)

        exclude = tuple(arg_to_iter(kwargs["exclude"]))
        if kwargs["exclude_dot"]:
            exclude += (re.compile(r"^\."),)

        LOGGER.info("excluding files: %s", exclude)

        minify(
            src=kwargs["source"],
            dst=kwargs["destination"],
            exclude_files=exclude,
            file_processors=DEFAULT_PROCESSORS,
        )
def _exclude(user=None, ids=None):
    if ids is None:
        return None

    try:
        import turicreate as tc
    except ImportError:
        LOGGER.exception("unable to import <turicreate>")
        return None

    ids = (
        ids
        if isinstance(ids, tc.SArray)
        else tc.SArray(tuple(arg_to_iter(ids)), dtype=int)
    )

    # pylint: disable=len-as-condition
    if ids is None or not len(ids):
        return None

    sframe = tc.SFrame({"bgg_id": ids})
    sframe["bgg_user_name"] = user

    del tc, ids

    return sframe
def _walk_files(path, exclude_files=None):
    exclude_files = tuple(arg_to_iter(exclude_files))
    filter_file = (partial(_filter_file, exclude_files=exclude_files)
                   if exclude_files else None)
    for curr_dir, _, files in os.walk(path):
        for file in filter(filter_file, files):
            yield os.path.join(curr_dir, file)
    def _process_repo(
        self,
        repo,
        directories,
        game_item,
        rating_item,
        game_csv,
        rating_csv,
        recommender_cls=BGGRecommender,
        recommender_dir=None,
        ranking_dir=None,
        max_iterations=100,
        date_str=DATE_TEMPLATE,
        overwrite=False,
        dry_run=False,
    ):
        if isinstance(repo, (str, os.PathLike)):
            repo = Repo(repo)

        LOGGER.info("Processing repository %s...", repo)

        recommender_dir = Path(recommender_dir) if recommender_dir else None
        ranking_dir = Path(ranking_dir) if ranking_dir else None

        if ranking_dir:
            ranking_fac_dir = ranking_dir / self.ranking_types[Ranking.FACTOR]
            ranking_sim_dir = ranking_dir / self.ranking_types[Ranking.SIMILARITY]
            if not dry_run:
                ranking_fac_dir.mkdir(parents=True, exist_ok=True)
                ranking_sim_dir.mkdir(parents=True, exist_ok=True)
        else:
            ranking_fac_dir = None
            ranking_sim_dir = None

        for directory in arg_to_iter(directories):
            LOGGER.info("Looking for all versions of <%s>...", directory)
            for commit in repo.iter_commits(paths=directory):
                try:
                    _process_commit(
                        commit=commit,
                        directory=directory,
                        recommender_cls=recommender_cls,
                        recommender_dir=recommender_dir,
                        ranking_fac_dir=ranking_fac_dir,
                        ranking_sim_dir=ranking_sim_dir,
                        game_item=game_item,
                        rating_item=rating_item,
                        game_csv=game_csv,
                        rating_csv=rating_csv,
                        max_iterations=max_iterations,
                        date_str=date_str,
                        overwrite=overwrite,
                        dry_run=dry_run,
                    )
                except Exception:
                    LOGGER.warning(
                        "There was an error processing commit <%s>, skipping...",
                        commit)
def _filter_file(file, exclude_files=None):
    for exclude in arg_to_iter(exclude_files):
        if isinstance(exclude, str):
            if file == exclude:
                return False
        elif exclude.match(file):
            return False
    return True
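A rough usage sketch of _walk_files together with _filter_file: excludes may be exact file names (plain strings) or compiled patterns, and matching files are dropped from the walk. The directory path below is purely illustrative.

import re

EXCLUDE = ("Thumbs.db", re.compile(r"^\."))  # exact name plus dot-files

for file_path in _walk_files("path/to/source", exclude_files=EXCLUDE):
    print(file_path)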
def _light_games(bgg_ids=None):
    # pylint: disable=no-member
    games = (
        Game.objects.all()
        if bgg_ids is None
        else Game.objects.filter(bgg_id__in=arg_to_iter(bgg_ids))
    )
    return games.values("bgg_id", "name", "year", "image_url")
def _extract_params(request, key, parser=None):
    data_values = (
        arg_to_iter(request.data.get(key))
        if isinstance(request.data, dict)
        else arg_to_iter(request.data)
    )
    query_values = arg_to_iter(request.query_params.getlist(key))
    values = _parse_parts(chain(data_values, query_values))

    if not callable(parser):
        yield from values
        return

    values = map(parser, values)
    for value in values:
        if value is not None:
            yield value
def _load_add_data(files, id_field, *fields, in_format=None):
    objs = _load(*arg_to_iter(files), in_format=in_format)
    result = {
        o.get(id_field): {field: o[field] for field in fields if field in o}
        for o in objs
    }
    LOGGER.info("loaded %d data items", len(result))
    return result
def parse_url(
    url: Union[str, ParseResult, None],
    hostnames: Optional[Iterable[Union[str, Pattern]]] = None,
) -> Optional[ParseResult]:
    """ parse URL and optionally filter for hosts """
    url = urlparse(url) if isinstance(url, str) else url
    hostnames = tuple(arg_to_iter(hostnames))
    return (url if url and url.hostname and url.path and (not hostnames or any(
        _match(url.hostname, hostname) for hostname in hostnames)) else None)
def validate_url(
    url: Union[str, ParseResult, None],
    hostnames: Optional[Iterable[Union[str, Pattern]]] = None,
    schemes: Optional[Iterable[Union[str, Pattern]]] = None,
) -> Optional[str]:
    """Returns cleaned up URL iff valid with scheme, hostname, and path."""
    url = parse_url(url=url, hostnames=hostnames)
    schemes = frozenset(arg_to_iter(schemes))
    if url is None or not url.scheme:
        return None
    if schemes and url.scheme not in schemes:
        return None
    return url.geturl()
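For illustration, a hedged sketch of how parse_url and validate_url are meant to behave, assuming _match compares a hostname against either a plain string or a compiled pattern:

validate_url(
    "https://boardgamegeek.com/boardgame/13/catan",
    hostnames=("boardgamegeek.com",),
    schemes=("http", "https"),
)
# -> "https://boardgamegeek.com/boardgame/13/catan"

validate_url("ftp://example.com/file.csv", schemes=("http", "https"))
# -> None (scheme not allowed)

parse_url("not a url")
# -> None (no hostname)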
def dfs_from_repo(repo, directories, files):
    """Load data from Git repo."""

    LOGGER.info("Loading data from %s...", repo)
    for directory, file in product(arg_to_iter(directories),
                                   arg_to_iter(files)):
        path = os.path.join(directory, file)
        LOGGER.info("Looking for all versions of <%s>...", path)
        for commit in repo.iter_commits(paths=path):
            try:
                blob = commit.tree / directory / file
            except Exception:
                LOGGER.exception("Path <%s> not found in commit <%s>...", path,
                                 commit)
                continue

            LOGGER.info(
                'Found <%s> from commit <%s>: "%s" (%s)',
                blob,
                commit,
                commit.message.strip(),
                commit.authored_datetime,
            )

            file_format = format_from_path(blob.name)

            try:
                if file_format == "csv":
                    data_frame = pd.read_csv(blob.data_stream)
                elif file_format in ("jl", "jsonl"):
                    data_frame = _df_from_jl(blob.data_stream.read().splitlines())
                else:
                    data_frame = None
            except Exception:
                LOGGER.exception("There was a problem loading <%s>...", blob)
                data_frame = None

            if data_frame is not None and not data_frame.empty:
                yield {
                    "data_frame": data_frame,
                    "commit": commit,
                    "blob": blob,
                    "date": commit.authored_datetime,
                }
def _parse_parts(args):
    for arg in arg_to_iter(args):
        if isinstance(arg, str):
            for parsed in arg.split(","):
                parsed = parsed.strip()
                if parsed:
                    yield parsed
        elif isinstance(arg, (list, tuple)):
            yield from _parse_parts(arg)
        else:
            yield arg
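For illustration, the flattening behaviour of _parse_parts: comma-separated strings are split and stripped, nested lists and tuples are recursed into, and empty parts are dropped.

list(_parse_parts(["1,2 , 3", ("4", 5), ""]))
# -> ["1", "2", "3", "4", 5]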
def _parse_link_ids(data, regex=LINK_ID_REGEX):
    result = defaultdict(lambda: defaultdict(list))
    for origin, links in data.items():
        _, id_orig = _parse_link_id(origin, regex)
        if id_orig is None:
            continue
        for site, id_dest in map(_parse_link_id, arg_to_iter(links)):
            if site and id_dest is not None:
                result[id_orig][site].append(id_dest)
    LOGGER.info("found links for %d items", len(result))
    return result
def _cp_any_files(dst, tree, files):
    dst_files = []

    for file in arg_to_iter(files):
        blob = tree / file
        dst_file = dst / file
        with dst_file.open("wb") as dst_fp:
            shutil.copyfileobj(blob.data_stream, dst_fp)
        dst_files.append(dst_file)

    return tuple(dst_files)
def games_in_articles(paths):
    seen = set()
    for path in arg_to_iter(paths):
        path = Path(path).resolve()
        with open(path) as file:
            for line in file:
                for match in regex.finditer(line):
                    bgg_id = parse_int(match.group(1))
                    name = match.group(2)
                    if bgg_id and bgg_id not in seen:
                        seen.add(bgg_id)
                        yield bgg_id, name, path
def _process_df(data_frame,
                columns=None,
                required_columns=None,
                target_column=None):
    if data_frame is None or data_frame.empty:
        LOGGER.error("DataFrame is empty")
        return None

    columns = clear_list(arg_to_iter(columns))
    required_columns = clear_list(arg_to_iter(required_columns)) or columns
    columns = clear_list(columns + required_columns)

    if not columns:
        LOGGER.error("No columns given")
        return None

    missing_columns = [
        column for column in required_columns if column not in data_frame
    ]
    if missing_columns:
        LOGGER.error("DataFrame does not contain the expected columns %s",
                     missing_columns)
        return None

    for column in columns:
        if column not in data_frame:
            data_frame[column] = None

    target_column = target_column or columns[0]
    return (
        data_frame[columns][data_frame[target_column].notna()]
        .sort_values(target_column)
        .rename(columns={"bayes_rating": "score"})
        .astype({"rank": int, "bgg_id": int})
    )
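A small, hypothetical example of the kind of frame _process_df expects (assuming clear_list de-duplicates while preserving order and the frame carries BGG-style ranking columns):

import pandas as pd

rankings = pd.DataFrame({
    "bgg_id": [13, 822, 9209],
    "rank": [3, 2, 1],
    "bayes_rating": [7.1, 7.4, 7.8],
})
result = _process_df(
    rankings,
    columns=("bgg_id", "bayes_rating"),
    required_columns=("rank",),
    target_column="rank",
)
# result is sorted by "rank", with "bayes_rating" renamed to "score"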
def _create_secondary_instances(
        model,
        secondary,
        items,
        models_order=(),
        batch_size=None,
        dry_run=False,
        **kwargs,
):
    instances = _make_secondary_instances(
        model=model,
        secondary=secondary,
        items=items,
        **kwargs,
    )
    del items
    batches = batchify(instances, batch_size) if batch_size else (instances, )
    del instances
    models_order = tuple(arg_to_iter(models_order))

    for count, batch in enumerate(batches):
        LOGGER.info("processing batch #%d...", count + 1)

        models = defaultdict(list)
        for instance in batch:
            models[type(instance)].append(instance)
        order = models_order or tuple(models.keys())
        del batch

        for mdl in order:
            instances = models.pop(mdl, ())
            if not dry_run and instances:
                LOGGER.info("creating %d instances of %r", len(instances), mdl)
                mdl.objects.bulk_create(instances)

        if any(models.values()):
            LOGGER.warning(
                "some models have not been processed properly: %r",
                tuple(models.keys()),
            )

        del models

    del batches

    LOGGER.info("done processing")
    def _excluded_games(self, user, params, include=None, exclude=None):
        params = params or {}
        params.setdefault("exclude_known", True)

        exclude = frozenset(arg_to_iter(exclude)) | frozenset(
            _parse_ints(params.get("exclude"))
        )

        exclude_known = parse_bool(take_first(params.get("exclude_known")))
        exclude_fields = [
            field
            for field in self.collection_fields
            if parse_bool(take_first(params.get(f"exclude_{field}")))
        ]
        exclude_wishlist = parse_int(take_first(params.get("exclude_wishlist")))
        exclude_play_count = parse_int(take_first(params.get("exclude_play_count")))
        exclude_clusters = parse_bool(take_first(params.get("exclude_clusters")))

        try:
            queries = [Q(**{field: True}) for field in exclude_fields]
            if exclude_known and exclude_clusters:
                queries.append(Q(rating__isnull=False))
            if exclude_wishlist:
                queries.append(Q(wishlist__lte=exclude_wishlist))
            if exclude_play_count:
                queries.append(Q(play_count__gte=exclude_play_count))
            if queries:
                query = reduce(or_, queries)
                exclude |= frozenset(
                    User.objects.get(name=user)
                    .collection_set.order_by()
                    .filter(query)
                    .values_list("game_id", flat=True)
                )

        except Exception:
            # e.g. the user does not exist; fall back to the explicit exclusions only
            pass

        return tuple(exclude) if not include else tuple(exclude - include)
    def _create_all_instances(
        self,
        path,
        filter_ids=None,
        week_day="SUN",
        types=None,
    ):
        types = frozenset(arg_to_iter(types))
        for ranking_type, (
            sub_dir,
            method,
            min_date,
            min_score,
        ) in self.ranking_types.items():
            if not types or ranking_type in types:
                yield from _create_instances(
                    path_dir=os.path.join(path, sub_dir),
                    ranking_type=ranking_type,
                    filter_ids=filter_ids,
                    method=method,
                    week_day=week_day,
                    min_date=min_date,
                    min_score=min_score,
                )
def jl_to_csv(in_path, out_path, columns=None, joiner=","):
    """Convert a JSON lines file into CSV."""

    columns = tuple(arg_to_iter(columns))

    LOGGER.info("Reading JSON lines from <%s> and writing CSV to <%s>...",
                in_path, out_path)

    with open(in_path) as in_file, open(out_path, "w") as out_file:
        if not columns:
            row = next(in_file, None)
            row = _process_row(row, joiner=joiner) if row else {}
            columns = tuple(row.keys())
        else:
            row = None

        rows = map(partial(_process_row, columns=columns, joiner=joiner),
                   in_file)

        writer = DictWriter(out_file, fieldnames=columns)
        writer.writeheader()
        if row:
            writer.writerow(row)
        writer.writerows(rows)
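A hedged usage sketch; _process_row (not shown) presumably flattens one JSON object per line into a CSV row, joining list values with joiner:

# games.jl, one JSON object per line, e.g.
#   {"bgg_id": 13, "name": "Catan", "designer": ["Klaus Teuber"]}
jl_to_csv("games.jl", "games.csv", columns=("bgg_id", "name", "designer"))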
def _parse_ids(values: Any) -> Generator[Tuple[int, str], None, None]:
    for value in arg_to_iter(values):
        id_, name = _parse_id(value)
        if id_ and name:
            yield id_, name
def _create_references(
    model,
    items,
    foreign=None,
    recursive=None,
    batch_size=None,
    dry_run=False,
):
    foreign = foreign or {}
    foreign = {k: tuple(arg_to_iter(v)) for k, v in foreign.items()}
    foreign = {k: v for k, v in foreign.items() if len(v) == 2}

    recursive = (
        recursive
        if isinstance(recursive, dict)
        else {r: r for r in arg_to_iter(recursive)}
    )

    if not foreign and not recursive:
        LOGGER.warning(
            "neither foreign nor recursive references given, got nothing to do..."
        )
        return

    LOGGER.info("creating foreign references: %r", foreign)
    LOGGER.info("creating recursive references: %r", recursive)

    count = -1
    foreign_values = {f[0]: defaultdict(set) for f in foreign.values()}
    updates = {}

    for count, item in enumerate(items):
        update = defaultdict(list)

        for field, (fmodel, _) in foreign.items():
            for value in filter(
                    None, map(_parse_value_id, arg_to_iter(item.get(field)))):
                id_ = value.get("id")
                value = value.get("value")
                if id_ and value:
                    foreign_values[fmodel][id_].add(value)
                    update[field].append(id_)

        for rec_from, rec_to in recursive.items():
            rec = {parse_int(r) for r in arg_to_iter(item.get(rec_from)) if r}
            rec = (sorted(
                model.objects.filter(pk__in=rec).values_list(
                    "pk", flat=True).distinct()) if rec else None)
            if rec:
                update[rec_to] = rec

        pkey = parse_int(item.get(model._meta.pk.name))
        if pkey and any(update.values()):
            updates[pkey] = update

        if (count + 1) % 1000 == 0:
            LOGGER.info("processed %d items so far", count + 1)

    del items, recursive

    LOGGER.info("processed %d items in total", count + 1)

    for fmodel, value_field in frozenset(foreign.values()):
        id_field = fmodel._meta.pk.name
        LOGGER.info("found %d items for model %r to create",
                    len(foreign_values[fmodel]), fmodel)
        values = (
            {id_field: k, value_field: take_first(v)}
            for k, v in foreign_values[fmodel].items()
            if k and v
        )
        _create_from_items(
            model=fmodel,
            items=values,
            batch_size=batch_size,
            dry_run=dry_run,
        )

    del foreign, foreign_values

    LOGGER.info("found %d items for model %r to update", len(updates), model)

    batches = (batchify(updates.items(), batch_size) if batch_size else
               (updates.items(), ))

    for count, batch in enumerate(batches):
        LOGGER.info("processing batch #%d...", count + 1)
        if not dry_run:
            with atomic():
                for pkey, update in batch:
                    try:
                        instance = model.objects.get(pk=pkey)
                        for field, values in update.items():
                            getattr(instance, field).set(values)
                        instance.save()
                    except Exception:
                        LOGGER.exception(
                            "an error ocurred when updating <%s> with %r",
                            pkey,
                            update,
                        )

    del batches, updates

    LOGGER.info("done updating")