Example #1
0
    def _execute_batch(self, batch: List[UpdateFromGitHubAPI]):
        """Run one batch of update commands as a Celery group and dispatch results.

        Every command in `batch` is wrapped in a `run_command` signature, the
        signatures are submitted together as a group, and this call blocks on
        the group result (``interval=1``) until all outcomes are available.
        Outcomes are then split by type and handed to `_handle_updates` and
        `_handle_missing`; outcomes of any other type are ignored.
        """
        make_signature = run_command.s
        signatures = (make_signature(command) for command in batch)
        outcomes = group(signatures).apply_async().get(interval=1)

        updated, missing = [], []
        for outcome in outcomes:
            # Two independent checks: an outcome matching both types is
            # recorded in both lists.
            if isinstance(outcome, RepositoryUpdated):
                updated.append(outcome)
            if isinstance(outcome, RepositoryNotFound):
                missing.append(outcome)

        # This should eventually be moved to an event listener
        summary = 'Updating: {}, Missing: {}'.format(len(updated), len(missing))
        log.info(summary)
        self._handle_updates(updated)
        self._handle_missing(missing)
Example #2
0
    def handle(self, cmd: CalculateImageComplexityScores) \
            -> ImageComplexityScoresCalculated:
        """Score the matching images through Celery and write the scores to CSV.

        Filenames are resolved from `cmd.path` / `cmd.pattern` and processed
        in chunks of 100. Each chunk is submitted as one Celery group of
        `CalculateImageComplexityScore` commands, and the handler waits for
        that group to finish before submitting the next one. All collected
        results are passed to `_to_csv` with `cmd.destination`.
        """
        images = self._get_filenames(cmd.path, cmd.pattern)
        log.info("Processing {} images".format(len(images)))

        chunk_size = 100
        scores = []
        for chunk in batched(images, chunk_size):
            signatures = (
                run_command.s(CalculateImageComplexityScore(name))
                for name in chunk
            )
            pending = group(signatures).apply_async()
            # Block on this chunk before queueing the next one.
            scores.extend(pending.get(interval=1))

        self._to_csv(cmd.destination, scores)
        return ImageComplexityScoresCalculated(cmd.destination)
Example #3
0
    def handle(self, cmd: ExecuteMahoutRecommender):
        """Run the Mahout recommender for one model and reload its results.

        Looks up the model named by `cmd.model` in `MODELS`, runs the Maven
        `exec:java` job in ``../growser-mahout/`` with the model's source and
        destination paths, deletes the model's existing `Recommendation` rows
        and bulk-loads replacements from the destination CSV.

        Returns:
            RecommendationsUpdated built from the model id and the batch job.

        Raises:
            ValueError: if `cmd.model` is not present in `MODELS`.
            subprocess.CalledProcessError: if the Maven process exits with a
                non-zero status. Nothing is deleted in that case.
        """
        model = MODELS.get(cmd.model)
        if model is None:
            raise ValueError(
                "Unknown recommendation model: {!r}".format(cmd.model))

        source = abspath(join(RATINGS_PATH, model.source))
        destination = abspath(join(EXPORT_PATH, model.destination))

        log.info('Running Mahout')
        run = ["mvn", "exec:java", "-DbatchSize=100",
               "-DmodelID={}".format(model.id),
               "-Dsrc=" + source,
               "-Dout=" + destination]
        # check_call (not call): a failed run must abort here, before the
        # existing recommendations are deleted below.
        subprocess.check_call(run, cwd="../growser-mahout/")

        Recommendation.query.filter(
            Recommendation.model_id == model.id).delete()

        columns = ['model_id', 'repo_id', 'recommended_repo_id', 'score']
        batch = from_sqlalchemy_table(
            Recommendation.__table__, from_csv(destination), columns)

        for rows in batch.batch_execute(db.engine.raw_connection):
            log.info("Batch complete: {}".format(rows))

        return RecommendationsUpdated(model.id, batch)
Example #4
0
def run_recommendations(ratings: str, output: str, num_repos: int):
    """Build the co-occurrence matrix and run both similarity passes over it.

    `ratings` is the path handed to `fetch_ratings` together with
    `num_repos`. The resulting matrix is multiplied by its own transpose and
    that product is passed to `_recommendations` twice: first with
    `score_llr`, then with `score_jaccard`.

    NOTE(review): `output` is accepted but never read in this function.
    """
    matrix = fetch_ratings(ratings, num_repos)

    log.info("Creating co-occurrence matrix (A'A)")
    cooccurrence = matrix.dot(matrix.T)

    # (log message, first positional argument, scoring function, name)
    passes = (
        ("Log-likelihood similarity", 4, score_llr,
         'co-occurrence.log-likelihood'),
        ("Jaccard similarity", 6, score_jaccard,
         'co-occurrence.jaccard'),
    )
    for message, first_arg, scorer, name in passes:
        log.info(message)
        _recommendations(first_arg, matrix.shape[1], matrix.index,
                         cooccurrence, scorer, name)
Example #5
0
def fetch_ratings(filename: str, num_repos: int):
    """Load a ratings CSV and build a binary repo-by-user matrix.

    The file is read without a header row as the columns
    ``login_id, repo_id, rating, date``. At most `MAX_LOGINS` users are drawn
    at random, then the `num_repos` repositories with the most ratings from
    those users are kept.

    Args:
        filename: Path of the ratings CSV.
        num_repos: Maximum number of repositories to keep.

    Returns:
        A DataFrame indexed by ``repo_id`` with one column per ``login_id``;
        a cell is 1 where that user rated that repository and 0 otherwise.
    """
    log.info("Loading %s", filename)
    ratings = pd.read_csv(filename, header=None,
                          names=['login_id', 'repo_id', 'rating', 'date'])
    ratings['value'] = 1

    log.info("Filtering ratings")
    user_counts = ratings.groupby('login_id')['repo_id'].count()
    # Clamp the sample size: sampling more rows than exist (without
    # replacement) raises ValueError. The draw is random, so sorting the
    # counts beforehand would not influence which users are kept.
    sampled_users = user_counts.sample(min(MAX_LOGINS, len(user_counts)))
    from_sampled_user = ratings['login_id'].isin(sampled_users.index)

    top_repos = ratings[from_sampled_user] \
        .groupby('repo_id')['login_id'].count() \
        .sort_values(ascending=False)[:num_repos]

    rv = ratings[from_sampled_user &
                 ratings['repo_id'].isin(top_repos.index)]
    # pivot() raises on repeated (repo_id, login_id) pairs; 'value' is always
    # 1, so collapsing repeats loses nothing.
    rv = rv.drop_duplicates(subset=['repo_id', 'login_id'])

    log.info("Creating user/repo matrix")
    df = rv.pivot(index='repo_id', columns='login_id', values='value').fillna(0)

    return df
Example #6
0
def run_command(command):
    """Log `command`, then execute it on a command bus built from `app`.

    Returns whatever the bus's `execute` call returns for the command.
    """
    message = "Executing command: {}".format(command)
    log.info(message)
    return commands(app).execute(command)