def predict(self, X, df):
    log_call()
    if self.mode == 'chunked':
        assert self._n_filled == self.n_total
    assert X.shape[0] == self.n_pred

    result = {}
    m = X.shape[0]
    csr = self.similarities_.tocsr()
    ids, dpds = list(df['id']), list(df['downloads_per_day'])
    for index in range(m):
        id_, dpd = ids[index], dpds[index]
        dpd_cutoff = max(self.min_dpd, dpd / self.min_dpd_ratio)

        # Nonzero similarities for this row of the CSR matrix
        left, right = csr.indptr[index], csr.indptr[index + 1]
        indices, data = csr.indices[left:right], csr.data[left:right]

        # Rank candidates by descending similarity, drop the package itself and
        # anything below the downloads-per-day cutoff, then keep the top n_recs
        rec_indices = indices[(-data).argsort()]
        rec_indices = (i for i in rec_indices if self._ids[i] != id_)
        rec_indices = (i for i in rec_indices if self._dpds[i] >= dpd_cutoff)
        rec_indices = islice(rec_indices, self.n_recs)

        recs = [self._ids[i] for i in rec_indices]
        result[id_] = recs
    return result

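# Usage sketch (illustrative names, not part of this module): given a fitted
# recommender `rec`, a feature matrix `X`, and the matching dataframe `df`,
# predict() maps each package id to its most similar package ids:
#
#     recs = rec.predict(X, df)
#     recs['Newtonsoft.Json']  # -> list of up to n_recs similar package ids
#
# Candidates are ranked by descending similarity, skipping the package itself
# and anything whose downloads/day falls below max(min_dpd, dpd / min_dpd_ratio).
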
def _compute_weights(self, df):
    log_call()
    m = df.shape[0]
    t = len(self.vocab_)
    weights = sparse.lil_matrix((m, t))
    index_map = {tag: index for index, tag in enumerate(self.vocab_)}

    # Explicit tags contribute weight * IDF for each tag on the package
    weight = self.weights['tags']
    for rowidx in range(m):
        for tag in _parse_tags(df['tags'][rowidx]):
            colidx = index_map[tag]
            idf = self.idfs_[tag]
            weights[rowidx, colidx] = weight * idf

    # Vocabulary terms appearing in the description or id also contribute,
    # but only terms recognized as hackwords
    cv = CountVectorizer(vocabulary=self.vocab_)
    for feature in ('description', 'id'):
        weight = self.weights[feature]
        counts = cv.transform(df[feature])
        for rowidx, colidx in zip(*counts.nonzero()):
            term = self.vocab_[colidx]
            if _is_hackword(term):
                # IDF alone seems to be working better than TF-IDF, so ignore TF
                idf = self.idfs_[term]
                weights[rowidx, colidx] += weight * idf
    return weights

def _freshness_vector(X):
    log_call()
    da = X['days_abandoned'].values
    da[np.isnan(da)] = np.nanmean(da)
    # Min-max scale, then invert so fresher (less abandoned) packages score higher
    m, M = np.min(da), np.max(da)
    da = (da - m) / (M - m)
    return 1 - da

def _popularity_vector(X):
    log_call()
    dpd = X['downloads_per_day'].values
    log_dpd = np.log1p(dpd)
    log_dpd[np.isnan(log_dpd)] = np.nanmean(log_dpd)
    m, M = np.min(log_dpd), np.max(log_dpd)
    log_dpd = (log_dpd - m) / (M - m)
    return log_dpd

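# Worked example of the scaling above (illustrative numbers): downloads/day of
# [0, 9, 99, 999] become log1p -> [0, ln(10), ln(100), ln(1000)], and min-max
# scaling then maps them to [0, 1/3, 2/3, 1], so popularity grows with the order
# of magnitude of downloads rather than with the raw count.
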
def _make_etags(self, weights):
    log_call()
    m = weights.shape[0]
    etags_col = pd.Series('', index=np.arange(m))
    # Group the nonzero weight entries by row and build one comma-separated
    # etags string per package
    nonzero = zip(*weights.nonzero())
    for rowidx, entries in groupby(nonzero, key=lambda entry: entry[0]):
        colidxs = [entry[1] for entry in entries]
        etags = ','.join(self._make_etags_for_row(weights, rowidx, colidxs))
        etags_col[rowidx] = etags
    return etags_col

def dump_etags(df, fname, include_weights):
    def get_tag(etag):
        tag, _ = etag.split(' ')
        return tag

    log_call()
    m = df.shape[0]
    with open(fname, 'w', encoding='utf-8') as file:
        for index in range(m):
            id_, etags = df['id'][index], df['etags'][index]
            if not include_weights and etags:
                etags = ','.join(map(get_tag, etags.split(',')))
            line = "{}: {}\n".format(id_, etags)
            file.write(line)

def _etags_matrix(X, vocab):
    log_call()
    m, t = X.shape[0], len(vocab)
    tag_weights = sparse.lil_matrix((m, t))
    index_map = {tag: index for index, tag in enumerate(vocab)}
    for rowidx, etags in enumerate(X['etags']):
        if etags:
            for etag in etags.split(','):
                tag, weight = etag.split()
                colidx = index_map[tag]
                tag_weights[rowidx, colidx] = np.float64(weight)
    return tag_weights.tocsr()

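# Sketch of the expected 'etags' format (values are illustrative): a cell such
# as "json 2.0,http 1.3" holds comma-separated "<tag> <weight>" pairs, so the
# loop above would set that row's 'json' and 'http' columns to 2.0 and 1.3.
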
def _compute_idfs(df):
    log_call()
    # IDF (inverse document frequency) formula: log(N / n_t), where
    # N is the number of documents (aka packages) and
    # n_t is the number of documents tagged with term t
    m = df.shape[0]  # aka N
    nts = {}
    for index in range(m):
        seen = {}
        for tag in _parse_tags(df['tags'][index]):
            if not seen.get(tag, False):
                nts[tag] = nts.get(tag, 0) + 1  # count each tag at most once per package
                seen[tag] = True
    log10_m = np.log10(m)
    return {tag: log10_m - np.log10(nt) for tag, nt in nts.items()}

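# Worked example (hypothetical counts): with N = 1000 packages and a tag that
# appears on 10 of them, the IDF is log10(1000) - log10(10) = 3 - 1 = 2;
# rarer tags therefore carry more weight than ubiquitous ones.
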
def _fit(self, X, df, X_pred, df_pred):  # pylint: disable=W0613
    log_call()
    assert sparse.isspmatrix_csr(X)

    metrics_and_penalties = [
        (_freshness_vector(df), self.penalties['freshness']),
        (_popularity_vector(df), self.penalties['popularity']),
    ]
    metrics, penalties = zip(*metrics_and_penalties)
    scale_vectors = _apply_penalties(metrics, penalties)
    scales = np.multiply(*scale_vectors)

    # Dot-product similarities between prediction rows and every row of X;
    # right-multiplying by diag(scales) scales column j (candidate j) by its
    # combined freshness/popularity factor
    similarities = linear_kernel(X_pred, X, dense_output=False)
    similarities *= sparse.diags(scales)
    return metrics, penalties, scales, similarities

async def write_packages(packages_root, args):
    def get_connector_kwargs():
        if is_windows:  # pylint: disable=W1025
            return dict(limit=60)
        return dict()

    log_call()
    os.makedirs(packages_root, exist_ok=True)
    endpoint_url = get_endpoint_url(args.api_endpoint)
    async with NugetContext(endpoint_url=endpoint_url,
                            connector_kwargs=get_connector_kwargs()) as ctx:
        client = await NugetCatalogClient(ctx).load()
        page_start = args.page_start
        page_end = args.page_start + (args.page_limit or sys.maxsize)
        pages = aislice(client.load_pages(), page_start, page_end)
        async for i, page in aenumerate(pages):
            pageno = page.pageno
            assert page_start + i == pageno

            fname = os.path.join(packages_root, 'page{}.csv'.format(pageno))
            if not args.force_refresh_packages and os.path.isfile(fname):
                LOG.debug("{} exists, skipping", fname)
                continue

            LOG.debug("Fetching packages for page #{}", pageno)
            try:
                with PackageSerializer(fname) as writer:
                    writer.write_header()
                    packages = list(page.packages)
                    results = await asyncio.gather(
                        *[package.load() for package in packages],
                        return_exceptions=True)
                    for package, result in zip(packages, results):
                        if isinstance(result, Exception) and not can_ignore_exception(result):
                            raise result
                        writer.write(package)
            except:
                LOG.debug("Exception thrown, deleting {}", fname)
                with contextlib.suppress(FileNotFoundError):
                    os.remove(fname)
                raise

def gen_blobs(df, tagger, args, blobs_root, vectors_root):
    log_call()
    chunk_fmt = os.path.join(vectors_root, 'chunk{}.npz')
    chunkmgr = ChunkManager(chunk_fmt)

    if not args.force_refresh_vectors and os.path.isdir(vectors_root):
        LOG.debug("Using existing vectors from {}", vectors_root)
        trans = FeatureTransformer(tags_vocab=tagger.vocab_)
        trans.fit(df)
    else:
        shutil.rmtree(vectors_root, ignore_errors=True)
        os.makedirs(vectors_root, exist_ok=True)
        trans = FeatureTransformer(tags_vocab=tagger.vocab_,
                                   mode='chunked',
                                   chunkmgr=chunkmgr)
        trans.fit_transform(df)
        trans.mode = 'onego'

    if args.force_refresh_blobs:
        shutil.rmtree(blobs_root, ignore_errors=True)
    os.makedirs(blobs_root, exist_ok=True)

    for pageno in pagenos(df):
        dirname = os.path.join(blobs_root, 'page{}'.format(pageno))
        if not args.force_refresh_blobs and os.path.isdir(dirname):
            LOG.debug("Blobs for page #{} already exist in {}, skipping",
                      pageno, dirname)
            continue

        pagedf = get_page(df, pageno)
        pagefeats = trans.transform(pagedf)
        try:
            gen_blobs_for_page(pageno=pageno,
                               df=pagedf,
                               feats=pagefeats,
                               parentdf=df,
                               blobs_root=blobs_root,
                               chunkmgr=chunkmgr)
        except:
            LOG.debug("Exception thrown, removing {}", dirname)
            shutil.rmtree(dirname, ignore_errors=True)
            raise

def fit_transform(self, df):
    log_call()
    self.vocab_ = sorted({tag for tags in df['tags']
                          for tag in _parse_tags(tags)})
    self.idfs_ = _compute_idfs(df)
    return self._enrich_tags(df)

def _description_matrix(X, vocab):
    log_call()
    vectorizer = TfidfVectorizer(vocabulary=vocab, **_DESCRIPTION_KWARGS)
    return vectorizer.fit_transform(X['description'])

def _authors_matrix(X, vocab):
    log_call()
    vectorizer = TfidfVectorizer(vocabulary=vocab, **_AUTHORS_KWARGS)
    return vectorizer.fit_transform(X['authors'])

def add_etags(df):
    log_call()
    tagger = SmartTagger()
    df = tagger.fit_transform(df)
    return df, tagger

def add_downloads_per_day(df):
    log_call()
    df['downloads_per_day'] = df['total_downloads'] / df['days_alive']
    # -1 marks missing download or age information, so the ratio is meaningless
    df.loc[(df['total_downloads'] == -1) | (df['days_alive'] == -1),
           'downloads_per_day'] = math.nan
    return df

def read_packages(packages_root, args):
    DEFAULT_DATETIME = datetime(year=1900, month=1, day=1)
    DATE_FEATURES = ['created', 'last_updated']

    def remove_duplicate_ids(df):
        df['id_lower'] = df['id'].apply(str.lower)
        # Keep the package with the highest version
        df.drop_duplicates(subset='id_lower', keep='last', inplace=True)
        df.drop(columns=['id_lower'], inplace=True)
        return df

    def remove_missing_info(df):
        df = df[~df['missing_info']]
        # These columns no longer have missing data, so we can set them to the correct type
        df['is_prerelease'] = df['is_prerelease'].astype(bool)
        df['listed'] = df['listed'].astype(bool)
        df['total_downloads'] = df['total_downloads'].astype(np.int32)
        df['verified'] = df['verified'].astype(bool)
        return df

    def remove_unlisted(df):
        df = df[df['listed']]
        df.drop(columns=['listed'], inplace=True)
        return df

    def use_nan_for_missing_values(df):
        features_and_defaults = [(['days_abandoned', 'days_alive'], -1),
                                 (DATE_FEATURES, DEFAULT_DATETIME)]
        for features, default in features_and_defaults:
            for feature in features:
                # assert all(~df[feature].isna())
                df.loc[df[feature] == default, feature] = math.nan
        return df

    log_call()
    pagedfs = []
    start = args.page_start
    end = args.page_start + (args.page_limit or sys.maxsize)
    for pageno in range(start, end):
        LOG.debug("Loading packages for page #{}", pageno)
        fname = os.path.join(packages_root, 'page{}.csv'.format(pageno))
        try:
            pagedf = pd.read_csv(fname, dtype=SCHEMA, na_filter=False,
                                 parse_dates=DATE_FEATURES)
            pagedf['pageno'] = pageno
            pagedfs.append(pagedf)
        except FileNotFoundError:
            LOG.debug("{} not found, stopping", fname)
            break

    df = pd.concat(pagedfs, ignore_index=True)
    pd.options.mode.chained_assignment = None
    try:
        df = remove_duplicate_ids(df)
        df = remove_missing_info(df)
        df = remove_unlisted(df)
        df = use_nan_for_missing_values(df)
        df.reset_index(drop=True, inplace=True)
    finally:
        pd.options.mode.chained_assignment = 'warn'
    return df

def _apply_penalties(metrics, penalties):
    log_call()
    assert len(metrics) == len(penalties)
    # Each metric lies in [0, 1]; a penalty p maps it onto the range [1 - p, 1]
    min_scales = [(1 - p) for p in penalties]
    return [1 * m + min_scale * (1 - m)
            for m, min_scale in zip(metrics, min_scales)]

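# Worked example (illustrative numbers): a penalty p shrinks the scale range to
# [1 - p, 1]. With a penalty of 0.2 and a metric value m = 0.5, the scale is
# 0.5 + 0.8 * (1 - 0.5) = 0.9, so a package halfway down that metric keeps 90%
# of its similarity score.
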
def add_chunkno(df, args):
    log_call()
    assert args.pages_per_chunk > 0
    df['chunkno'] = np.floor(df['pageno'] / args.pages_per_chunk).astype(np.int32)
    return df