Example #1
    def predict(self, X, df):
        log_call()

        if self.mode == 'chunked':
            assert self._n_filled == self.n_total
            assert X.shape[0] == self.n_pred

        result = {}
        m = X.shape[0]
        csr = self.similarities_.tocsr()

        ids, dpds = list(df['id']), list(df['downloads_per_day'])
        for index in range(m):
            id_, dpd = ids[index], dpds[index]
            dpd_cutoff = max(self.min_dpd, (dpd / self.min_dpd_ratio))

            left, right = csr.indptr[index], csr.indptr[index + 1]
            indices, data = csr.indices[left:right], csr.data[left:right]

            rec_indices = indices[(-data).argsort()]
            rec_indices = (i for i in rec_indices if self._ids[i] != id_)
            rec_indices = (i for i in rec_indices if self._dpds[i] >= dpd_cutoff)
            rec_indices = islice(rec_indices, self.n_recs)

            recs = [self._ids[i] for i in rec_indices]
            result[id_] = recs

        return result
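
The loop above never densifies a similarity row: it slices the CSR matrix through indptr/indices/data and ranks the surviving columns by similarity. A minimal standalone sketch of that slicing pattern, using a made-up 3x3 similarity matrix rather than the project's data:

import numpy as np
from scipy import sparse

# Toy symmetric similarity matrix (illustration only).
sim = sparse.csr_matrix(np.array([[0.0, 0.9, 0.2],
                                  [0.9, 0.0, 0.5],
                                  [0.2, 0.5, 0.0]]))
row = 1
left, right = sim.indptr[row], sim.indptr[row + 1]
indices, data = sim.indices[left:right], sim.data[left:right]
# Column indices of row 1, ordered by descending similarity.
print(indices[(-data).argsort()])  # [0 2]
print(data[(-data).argsort()])     # [0.9 0.5]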
Example #2
    def _compute_weights(self, df):
        log_call()
        m = df.shape[0]
        t = len(self.vocab_)
        weights = sparse.lil_matrix((m, t))
        index_map = {tag: index for index, tag in enumerate(self.vocab_)}

        weight = self.weights['tags']
        for rowidx in range(m):
            for tag in _parse_tags(df['tags'][rowidx]):
                colidx = index_map[tag]
                idf = self.idfs_[tag]
                weights[rowidx, colidx] = weight * idf

        cv = CountVectorizer(vocabulary=self.vocab_)
        for feature in ('description', 'id'):
            weight = self.weights[feature]
            counts = cv.transform(df[feature])
            for rowidx, colidx in zip(*counts.nonzero()):
                term = self.vocab_[colidx]
                if _is_hackword(term):
                    # IDF alone seems to be working better than TF-IDF, so ignore TF
                    idf = self.idfs_[term]
                    weights[rowidx, colidx] += weight * idf

        return weights
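
With a fixed vocabulary, CountVectorizer.transform can be called without fitting, and the nonzero() coordinates of the resulting count matrix can be walked term by term, which is the pattern used above. A small sketch on invented documents and a three-word vocabulary:

from sklearn.feature_extraction.text import CountVectorizer

vocab = ['http', 'json', 'xml']
cv = CountVectorizer(vocabulary=vocab)
counts = cv.transform(['json over http', 'plain xml'])  # no fit needed
for rowidx, colidx in zip(*counts.nonzero()):
    print(rowidx, vocab[colidx], counts[rowidx, colidx])
# 0 http 1
# 0 json 1
# 1 xml 1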
Example #3
def _freshness_vector(X):
    log_call()
    # Copy so that imputing NaNs below does not mutate the caller's DataFrame
    da = X['days_abandoned'].values.copy()
    da[np.isnan(da)] = np.nanmean(da)

    # Min-max scale to [0, 1]; more days abandoned means staler, so invert
    m, M = np.min(da), np.max(da)
    da = (da - m) / (M - m)
    return 1 - da
Example #4
def _popularity_vector(X):
    log_call()
    dpd = X['downloads_per_day'].values
    log_dpd = np.log1p(dpd)
    log_dpd[np.isnan(log_dpd)] = np.nanmean(log_dpd)

    m, M = np.min(log_dpd), np.max(log_dpd)
    log_dpd = (log_dpd - m) / (M - m)
    return log_dpd
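
Both _freshness_vector and _popularity_vector follow the same recipe: take the raw column, impute NaNs with the mean of the observed values, and min-max scale into [0, 1]; the popularity case additionally applies log1p first to tame the heavy tail of download counts. A self-contained sketch with invented numbers:

import numpy as np

dpd = np.array([0.0, 10.0, 1000.0, np.nan])  # hypothetical downloads per day
log_dpd = np.log1p(dpd)
log_dpd[np.isnan(log_dpd)] = np.nanmean(log_dpd)
m, M = np.min(log_dpd), np.max(log_dpd)
print((log_dpd - m) / (M - m))
# 0.0 for the least popular package, 1.0 for the most popular,
# and the NaN entry lands at the (log-scale) mean.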
Example #5
    def _make_etags(self, weights):
        log_call()
        m = weights.shape[0]
        etags_col = pd.Series('', index=np.arange(m))

        nonzero = zip(*weights.nonzero())
        for rowidx, entries in groupby(nonzero, key=lambda entry: entry[0]):
            colidxs = [entry[1] for entry in entries]
            etags = ','.join(self._make_etags_for_row(weights, rowidx,
                                                      colidxs))
            etags_col[rowidx] = etags
        return etags_col
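
Because nonzero() on a SciPy sparse matrix yields its coordinates in row-major order, grouping the (row, col) pairs by row collects each row's nonzero column indices in one pass, which is what the groupby above relies on. A toy sketch of just that pattern (the matrix below is made up):

from itertools import groupby

import numpy as np
from scipy import sparse

weights = sparse.csr_matrix(np.array([[0.0, 1.5, 0.7],
                                      [0.0, 0.0, 0.0],
                                      [2.0, 0.0, 0.0]]))
nonzero = zip(*weights.nonzero())
for rowidx, entries in groupby(nonzero, key=lambda entry: entry[0]):
    print(rowidx, [int(entry[1]) for entry in entries])
# 0 [1, 2]
# 2 [0]

Rows with no nonzero entries (row 1 here) produce no group at all, which is why etags_col is pre-filled with empty strings.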
Example #6
def dump_etags(df, fname, include_weights):
    def get_tag(etag):
        tag, _ = etag.split(' ')
        return tag

    log_call()
    m = df.shape[0]
    with open(fname, 'w', encoding='utf-8') as file:
        for index in range(m):
            id_, etags = df['id'][index], df['etags'][index]
            if not include_weights and etags:
                etags = ','.join(map(get_tag, etags.split(',')))
            line = "{}: {}\n".format(id_, etags)
            file.write(line)
Example #7
def _etags_matrix(X, vocab):
    log_call()

    m, t = X.shape[0], len(vocab)
    tag_weights = sparse.lil_matrix((m, t))
    index_map = {tag: index for index, tag in enumerate(vocab)}

    for rowidx, etags in enumerate(X['etags']):
        if etags:
            for etag in etags.split(','):
                tag, weight = etag.split()
                colidx = index_map[tag]
                tag_weights[rowidx, colidx] = np.float64(weight)

    return tag_weights.tocsr()
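
The sketch below unpacks the 'etags' string format this function assumes: a comma-separated list of "tag weight" pairs per package (the sample strings are invented for illustration):

import numpy as np
from scipy import sparse

vocab = ['http', 'json', 'xml']
index_map = {tag: index for index, tag in enumerate(vocab)}
etag_rows = ['json 1.5,http 0.7', '', 'xml 2.0']  # hypothetical etags column

tag_weights = sparse.lil_matrix((len(etag_rows), len(vocab)))
for rowidx, etags in enumerate(etag_rows):
    if etags:
        for etag in etags.split(','):
            tag, weight = etag.split()
            tag_weights[rowidx, index_map[tag]] = np.float64(weight)

print(tag_weights.toarray())
# [[0.7 1.5 0. ]
#  [0.  0.  0. ]
#  [0.  0.  2. ]]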
Example #8
def _compute_idfs(df):
    log_call()
    # IDF (inverse document frequency) formula: log10(N / n_t), where
    # N is the number of documents (aka packages) and
    # n_t is the number of documents tagged with term t
    m = df.shape[0]  # aka N
    nts = {}
    for index in range(m):
        seen = set()
        for tag in _parse_tags(df['tags'][index]):
            if tag not in seen:
                nts[tag] = nts.get(tag, 0) + 1
                seen.add(tag)

    log10_m = np.log10(m)
    return {tag: log10_m - np.log10(nt) for tag, nt in nts.items()}
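
A worked toy run of the computation above, assuming (purely for illustration) that _parse_tags splits a comma-separated tag string:

import numpy as np

tag_strings = ['json,http', 'json', 'xml']  # three hypothetical packages
m = len(tag_strings)
nts = {}
for tags in tag_strings:
    for tag in set(tags.split(',')):  # count each tag once per document
        nts[tag] = nts.get(tag, 0) + 1

idfs = {tag: np.log10(m) - np.log10(nt) for tag, nt in nts.items()}
# 'json' appears in 2 of 3 packages -> log10(3/2) ~= 0.18;
# 'http' and 'xml' appear in 1 of 3  -> log10(3)   ~= 0.48.
print(idfs)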
Example #9
    def _fit(self, X, df, X_pred, df_pred): # pylint: disable=W0613
        log_call()
        assert sparse.isspmatrix_csr(X)

        metrics_and_penalties = [
            (_freshness_vector(df), self.penalties['freshness']),
            (_popularity_vector(df), self.penalties['popularity']),
        ]
        metrics, penalties = zip(*metrics_and_penalties)
        scale_vectors = _apply_penalties(metrics, penalties)
        scales = np.multiply(*scale_vectors)

        similarities = linear_kernel(X_pred, X, dense_output=False)
        similarities *= sparse.diags(scales)

        return metrics, penalties, scales, similarities
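
Right-multiplying the (n_pred x n) similarity matrix by sparse.diags(scales) scales its j-th column by scales[j], i.e. down-weights every candidate package by its own freshness/popularity scale (for SciPy sparse matrices, the *= above is a matrix product). A toy sketch with made-up numbers:

import numpy as np
from scipy import sparse

similarities = sparse.csr_matrix(np.array([[1.0, 0.5, 0.2],
                                           [0.3, 1.0, 0.8]]))
scales = np.array([1.0, 0.5, 0.25])
scaled = similarities @ sparse.diags(scales)
print(scaled.toarray())  # column j is multiplied by scales[j]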
Example #10
async def write_packages(packages_root, args):
    def get_connector_kwargs():
        if is_windows:  # pylint: disable=W1025
            return dict(limit=60)
        return dict()

    log_call()
    os.makedirs(packages_root, exist_ok=True)

    endpoint_url = get_endpoint_url(args.api_endpoint)
    async with NugetContext(endpoint_url=endpoint_url,
                            connector_kwargs=get_connector_kwargs()) as ctx:
        client = await NugetCatalogClient(ctx).load()
        page_start, page_end = args.page_start, args.page_start + (
            args.page_limit or sys.maxsize)
        pages = aislice(client.load_pages(), page_start, page_end)

        async for i, page in aenumerate(pages):
            pageno = page.pageno
            assert page_start + i == pageno

            fname = os.path.join(packages_root, 'page{}.csv'.format(pageno))
            if not args.force_refresh_packages and os.path.isfile(fname):
                LOG.debug("{} exists, skipping", fname)
                continue

            LOG.debug("Fetching packages for page #{}", pageno)
            try:
                with PackageSerializer(fname) as writer:
                    writer.write_header()
                    packages = list(page.packages)
                    results = await asyncio.gather(
                        *[package.load() for package in packages],
                        return_exceptions=True)
                    for package, result in zip(packages, results):
                        if isinstance(result, Exception
                                      ) and not can_ignore_exception(result):
                            raise result
                        writer.write(package)
            except:
                LOG.debug("Exception thrown, deleting {}", fname)
                with contextlib.suppress(FileNotFoundError):
                    os.remove(fname)
                raise
Example #11
def gen_blobs(df, tagger, args, blobs_root, vectors_root):
    log_call()

    chunk_fmt = os.path.join(vectors_root, 'chunk{}.npz')
    chunkmgr = ChunkManager(chunk_fmt)

    if not args.force_refresh_vectors and os.path.isdir(vectors_root):
        LOG.debug("Using existing vectors from {}", vectors_root)
        trans = FeatureTransformer(tags_vocab=tagger.vocab_)
        trans.fit(df)
    else:
        shutil.rmtree(vectors_root, ignore_errors=True)
        os.makedirs(vectors_root, exist_ok=True)
        trans = FeatureTransformer(tags_vocab=tagger.vocab_,
                                   mode='chunked',
                                   chunkmgr=chunkmgr)
        trans.fit_transform(df)
        trans.mode = 'onego'

    if args.force_refresh_blobs:
        shutil.rmtree(blobs_root, ignore_errors=True)
    os.makedirs(blobs_root, exist_ok=True)
    for pageno in pagenos(df):
        dirname = os.path.join(blobs_root, 'page{}'.format(pageno))
        if not args.force_refresh_blobs and os.path.isdir(dirname):
            LOG.debug("Blobs for page #{} already exist in {}, skipping",
                      pageno, dirname)
            continue

        pagedf = get_page(df, pageno)
        pagefeats = trans.transform(pagedf)
        try:
            gen_blobs_for_page(pageno=pageno,
                               df=pagedf,
                               feats=pagefeats,
                               parentdf=df,
                               blobs_root=blobs_root,
                               chunkmgr=chunkmgr)
        except:
            LOG.debug("Exception thrown, removing {}", dirname)
            shutil.rmtree(dirname, ignore_errors=True)
            raise
Example #12
    def fit_transform(self, df):
        log_call()
        self.vocab_ = sorted(
            {tag for tags in df['tags'] for tag in _parse_tags(tags)})
        self.idfs_ = _compute_idfs(df)
        return self._enrich_tags(df)
Example #13
def _description_matrix(X, vocab):
    log_call()
    vectorizer = TfidfVectorizer(vocabulary=vocab, **_DESCRIPTION_KWARGS)
    return vectorizer.fit_transform(X['description'])
Example #14
def _authors_matrix(X, vocab):
    log_call()
    vectorizer = TfidfVectorizer(vocabulary=vocab, **_AUTHORS_KWARGS)
    return vectorizer.fit_transform(X['authors'])
Example #15
def add_etags(df):
    log_call()
    tagger = SmartTagger()
    df = tagger.fit_transform(df)
    return df, tagger
Example #16
def add_downloads_per_day(df):
    log_call()
    df['downloads_per_day'] = df['total_downloads'] / df['days_alive']
    df.loc[(df['total_downloads'] == -1) | (df['days_alive'] == -1),
           'downloads_per_day'] = math.nan
    return df
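
A quick toy illustration of the sentinel handling above: -1 marks missing download or age information upstream, and those rows end up as NaN rather than a meaningless ratio (the numbers are invented):

import math

import pandas as pd

df = pd.DataFrame({'total_downloads': [1000, -1, 50],
                   'days_alive': [100, 20, -1]})
df['downloads_per_day'] = df['total_downloads'] / df['days_alive']
df.loc[(df['total_downloads'] == -1) | (df['days_alive'] == -1),
       'downloads_per_day'] = math.nan
print(df['downloads_per_day'].tolist())  # [10.0, nan, nan]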
Example #17
def read_packages(packages_root, args):
    DEFAULT_DATETIME = datetime(year=1900, month=1, day=1)
    DATE_FEATURES = ['created', 'last_updated']

    def remove_duplicate_ids(df):
        df['id_lower'] = df['id'].apply(str.lower)
        # Keep the package with the highest version
        df.drop_duplicates(subset='id_lower', keep='last', inplace=True)
        df.drop(columns=['id_lower'], inplace=True)
        return df

    def remove_missing_info(df):
        df = df[~df['missing_info']]
        # These columns no longer have missing data, so we can set them to the correct type
        df['is_prerelease'] = df['is_prerelease'].astype(bool)
        df['listed'] = df['listed'].astype(bool)
        df['total_downloads'] = df['total_downloads'].astype(np.int32)
        df['verified'] = df['verified'].astype(bool)
        return df

    def remove_unlisted(df):
        df = df[df['listed']]
        df.drop(columns=['listed'], inplace=True)
        return df

    def use_nan_for_missing_values(df):
        features_and_defaults = [(['days_abandoned', 'days_alive'], -1),
                                 (DATE_FEATURES, DEFAULT_DATETIME)]
        for features, default in features_and_defaults:
            for feature in features:
                #assert all(~df[feature].isna())
                df.loc[df[feature] == default, feature] = math.nan
        return df

    log_call()
    pagedfs = []
    start, end = args.page_start, args.page_start + (args.page_limit
                                                     or sys.maxsize)

    for pageno in range(start, end):
        LOG.debug("Loading packages for page #{}", pageno)
        fname = os.path.join(packages_root, 'page{}.csv'.format(pageno))
        try:
            pagedf = pd.read_csv(fname,
                                 dtype=SCHEMA,
                                 na_filter=False,
                                 parse_dates=DATE_FEATURES)
            pagedf['pageno'] = pageno
            pagedfs.append(pagedf)
        except FileNotFoundError:
            LOG.debug("{} not found, stopping", fname)
            break

    df = pd.concat(pagedfs, ignore_index=True)

    pd.options.mode.chained_assignment = None
    try:
        df = remove_duplicate_ids(df)
        df = remove_missing_info(df)
        df = remove_unlisted(df)
        df = use_nan_for_missing_values(df)
        df.reset_index(drop=True, inplace=True)
    finally:
        pd.options.mode.chained_assignment = 'warn'

    return df
Example #18
def _apply_penalties(metrics, penalties):
    log_call()
    assert len(metrics) == len(penalties)

    # Linearly rescale each metric from [0, 1] to [1 - penalty, 1]: a metric of
    # 1 keeps full weight, a metric of 0 is scaled down by the full penalty.
    min_scales = [(1 - p) for p in penalties]
    return [m + min_scale * (1 - m) for m, min_scale in zip(metrics, min_scales)]
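
With toy numbers: a penalty of 0.4 maps a metric of 0 to a scale of 0.6 and a metric of 1 to a scale of 1.0, interpolating linearly in between (values invented for illustration):

import numpy as np

metric = np.array([0.0, 0.5, 1.0])
penalty = 0.4
min_scale = 1 - penalty
print(metric + min_scale * (1 - metric))  # [0.6 0.8 1. ]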
Example #19
def add_chunkno(df, args):
    log_call()
    assert args.pages_per_chunk > 0
    df['chunkno'] = np.floor(df['pageno'] / args.pages_per_chunk).astype(
        np.int32)
    return df
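
A toy run of the chunk assignment above: with pages_per_chunk = 2 (a hypothetical setting), pages 0-1 fall in chunk 0, pages 2-3 in chunk 1, and so on:

import numpy as np
import pandas as pd

df = pd.DataFrame({'pageno': [0, 1, 2, 3, 4]})
df['chunkno'] = np.floor(df['pageno'] / 2).astype(np.int32)
print(df['chunkno'].tolist())  # [0, 0, 1, 1, 2]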