def parse_test(self):
    """Parse test.txt which has test subjects.

    Each non-empty line holds one integer user id; populates
    self.test_u with the sorted list of those ids.
    """
    msg("parsing test.txt")
    # Context manager replaces the old bare file() call, which leaked
    # the file handle.
    with open('/'.join((self.datadir, "test.txt"))) as fh:
        lines = fh.read().split("\n")
    self.test_u = sorted([int(line) for line in lines if line])
def parse_lang(self):
    """Parse lang.txt which has language composition information.

    Each line is "repos_id:Lang;kloc,Lang;kloc,...".  Fills:
      self.lang_by_r[lang] -> list of (log-scaled kloc, repos_id),
                              sorted by repos_id
      self.r_langs[repos]  -> list of (lang, log-scaled kloc)
    """
    msg("parsing lang.txt")
    # Context manager replaces the old bare file() call, which leaked
    # the file handle.
    with open('/'.join((self.datadir, "lang.txt"))) as fh:
        lines = fh.read().split("\n")
    pairs = [line.split(":") for line in lines if line]
    pairs = [(int(pair[0]),
              [tuple(x.split(";")) for x in pair[1].split(",")])
             for pair in pairs]
    # Normalize each entry to (repos_id, ((kloc, lang), ...)) with the
    # language name lowercased.
    pairs = [(x, tuple([(int(z[1]), z[0].lower()) for z in y]))
             for (x, y) in pairs]
    # NOTE(review): the original also built a sorted list of all
    # distinct language names here, but the result was never read
    # again; that dead code has been removed.
    msg("build lang_by_r and r_langs")
    for repos, langs in pairs:
        for kloc, lang in langs:
            # Compress kloc to its order of magnitude so that
            # similarly-sized codebases compare equal.
            lnloc = int(log(kloc + 1, 10))
            self.lang_by_r[lang].append((lnloc, repos))
            self.r_langs[repos].append((lang, lnloc))
    for lang in self.lang_by_r.keys():
        # Keep each language's repository list ordered by repos id.
        self.lang_by_r[lang].sort(key=lambda x: x[1])
def fill_pickle_jar(self):
    """Serialize every attribute named in self.fields to datadir/pickle.jar.

    The field list itself is stored under the 'fields' key so a loader
    can discover what was pickled.
    """
    jar = '/'.join((self.datadir, "pickle.jar"))
    d = {}
    msg("Filling pickle jar '%s'" % jar)
    for field in self.fields:
        d[field] = getattr(self, field)
    d['fields'] = self.fields
    # 'wb' instead of 'w': pickle streams are binary data and text
    # mode can corrupt them.  The context manager guarantees the
    # handle is closed even if dump() raises.
    with open(jar, 'wb') as jarf:
        pickle.dump(d, jarf)
def parse_repos(self):
    """Parse repos.txt which has repository lineage information.

    Each line is "repos_id:author/name,YYYY-MM-DD[,parent_id]".
    Fills forks_of_r, parent_of_r, gparent_of_r, r_info, r_name,
    u_authoring and r_prefixes.
    """
    msg("parsing repos.txt")
    # Context manager replaces the old bare file() call, which leaked
    # the file handle.
    with open('/'.join((self.datadir, "repos.txt"))) as fh:
        lines = fh.read().split("\n")
    pairs = [line.replace(":", ",").split(",") for line in lines if line]
    # (repos_id, parent_id-or-0, "author/name", "YYYY-MM-DD");
    # pair[3:4] is a safe way to test for the optional parent field.
    pairs = [tuple([int(pair[0]), int(pair[3]) if pair[3:4] else 0,
                    pair[1], pair[2]]) for pair in pairs]
    for repos, parent, name, creation in pairs:
        if parent > 0:
            self.forks_of_r[parent].append(repos)
            self.parent_of_r[repos] = parent
        author, name = name.split("/")
        words = [int(x) for x in creation.split("-")]
        # Store creation dates as ordinals so they can be averaged
        # and compared numerically.
        creation = date(words[0], words[1], words[2]).toordinal()
        self.r_info[repos] = (author, name, creation)
        self.u_authoring[author].append(repos)
        self.r_name[name].append(repos)
        # Index repositories by name prefixes ("-" / "." / "_" are
        # treated as the same separator).
        words = name.lower().replace("-", "_").replace(".", "_")
        words = words.split("_")
        # Drop short words and the last word, so only proper prefixes
        # of the name are indexed.
        prefixes = [w for w in words if len(w) > 2][:-1]
        if not prefixes:
            continue
        for i in xrange(1, len(prefixes)):
            prefix = "-".join(prefixes[0:i])
            # Skip prefixes too generic to be discriminative.
            if prefix in ('the', 'test', 'php', 'acts'):
                continue
            self.r_prefixes[prefix].append(repos)
    # Derive grandparents from two hops through parent_of_r.
    for repos_gen1, repos_gen2 in self.parent_of_r.items():
        if repos_gen2 in self.parent_of_r:
            repos_gen3 = self.parent_of_r[repos_gen2]
            self.gparent_of_r[repos_gen1] = repos_gen3
def summary(self, unabridged=False):
    """Print a quick overview of every major mapping plus test_u.

    When unabridged is False only the first five entries of each
    mapping (and of test_u) are shown.
    """
    props = ("watching_r", "u_watching", "r_info", "r_name",
             "r_langs", "forks_of_r", "parent_of_r", "gparent_of_r",
             "lang_by_r", "u_authoring")
    for prop in props:
        print(">> %s" % prop)
        mapping = getattr(self, prop)
        if unabridged:
            pprint(dict(mapping.items()))
        else:
            pprint(dict(mapping.items()[:5]))
        print("")
    msg("test_u")
    if unabridged:
        pprint(self.test_u)
    else:
        pprint(self.test_u[:5])
def process(self, partition=1):
    """Run recommendations for every test user.

    partition: optional divisor.  When > 1, db.test_u is trimmed to
    its first 1/partition (sorted ascending) before processing.  The
    default of 1 processes everyone, matching the previous behavior
    where this value was a hard-coded local constant.
    """
    db = self.database
    if partition > 1:
        # Integer division: keep the first 1/partition of the users.
        new_len = len(db.test_u) // partition
        msg("Partitioning 1/%d [%d]" % (partition, new_len))
        db.test_u = sorted(db.test_u)[:new_len]
    msg("Beginning recommendations")
    total = len(db.test_u)
    i = 0
    # Process users in descending id order, reporting every 10 users.
    for u in sorted(db.test_u, reverse=True):
        self.recommended[u] = self.user_process(u)
        i += 1
        if i % 10 == 0:
            msg("[%3.2f%%] %d/%d processed"
                % (float(i) / float(total) * 100.0, i, total))
def parse_watching(self):
    """Parse data.txt which has main user-repository relationships """
    # Each line is "user:repos".  Builds watching_r (repos -> users)
    # and u_watching (user -> repos), then per-repository tf-idf
    # weights, the global top-50 repository list and, when
    # self.save_db is set, pairwise user-distance tables in MySQL.
    msg("parsing data.txt")
    lines = file('/'.join((self.datadir, "data.txt"))).read().split("\n")
    test_r = set()
    pairs = [[int(x) for x in line.split(":")] for line in lines if line]
    for user, repos in pairs:
        self.watching_r[repos].append(user)
        self.u_watching[user].append(repos)
        if user in self.test_u:
            # NOTE(review): test_r is collected here but never read
            # again in this method -- looks like leftover code; verify
            # before removing.
            test_r.add(repos)
    msg("calculating tf-idf")
    iter = 0
    total_users = float(len(self.u_watching))
    for repos, users in self.watching_r.items():
        # Inverse document frequency of the repository: rarer
        # repositories weigh more; +1.0 guards the division.
        idf_repos = log(total_users / (1.0 + len(self.watching_r[repos])))
        tf_idf_avg = 0.0
        for user in users:
            # Term frequency: one watch out of everything this user
            # watches.
            tf_user = 1.0 / len(self.u_watching[user])
            tf_idf = tf_user * idf_repos
            tf_idf_avg += tf_idf
            self.r_idf[repos].append((user, tf_idf))
        # counter
        iter += 1
        if iter % 10000 == 0:
            msg("tf-idf iter %d" % iter)
        self.r_idf_avg[repos] = tf_idf_avg / len(users)
    msg("making top_repos")
    # Top 50 repositories by raw watcher count.
    top_repos = sorted(self.watching_r.items(), key=lambda x:len(x[1]),
                       reverse=True)
    self.top_repos = [repos[0] for repos in top_repos[:50]]
    if not self.save_db:
        return
    # Pairwise user distance matrices, written to MySQL in batches.
    # Credentials are placeholders in this snapshot.
    conn = mysqldb.connect(host='127.0.0.1', user='******', passwd='',
                           db='matrix')
    c = conn.cursor()
    iter = 0
    values = []
    msg("making u_matrix_fwd")
    users = sorted(self.u_watching.keys())
    for i in xrange(len(users)):
        for j in xrange(i + 1, len(users)):
            s_i = set(self.u_watching[users[i]])
            s_j = set(self.u_watching[users[j]])
            # Distance = sqrt(|symmetric difference of watch sets|);
            # SQRT is evaluated by MySQL at insert time.
            diff = len(set.symmetric_difference(s_i, s_j))
            values.append("(%d,%d,SQRT(%d))" % (users[i], users[j], diff))
            iter += 1
            if iter % 10000 == 0:
                # Flush a batch of 10000 rows.
                sql = "".join(("INSERT INTO u_matrix_fwd(u1,u2,val) VALUES",
                               ",".join(values)))
                c.execute(sql)
                conn.commit()
                values = []
                msg("umf iter %d" % iter)
    if values:
        # Flush the final partial batch.
        sql = "".join(("INSERT INTO u_matrix_fwd(u1,u2,val) VALUES",
                       ",".join(values)))
        c.execute(sql)
        conn.commit()
    msg("umf iter %d [END]" % iter)
    iter = 0
    values = []
    msg("making u_matrix_bkwd")
    # Same computation with users in descending order, so each pair is
    # stored with the larger id first.
    users = sorted(self.u_watching.keys(), reverse=True)
    for i in xrange(len(users)):
        for j in xrange(i + 1, len(users)):
            s_i = set(self.u_watching[users[i]])
            s_j = set(self.u_watching[users[j]])
            diff = len(set.symmetric_difference(s_i, s_j))
            values.append("(%d,%d,SQRT(%d))" % (users[i], users[j], diff))
            iter += 1
            if iter % 10000 == 0:
                sql = "".join(("INSERT INTO u_matrix_bkwd(u1,u2,val) VALUES",
                               ",".join(values)))
                c.execute(sql)
                conn.commit()
                values = []
                msg("umb iter %d" % iter)
    if values:
        sql = "".join(("INSERT INTO u_matrix_bkwd(u1,u2,val) VALUES",
                       ",".join(values)))
        c.execute(sql)
        conn.commit()
    msg("umb iter %d [END]" % iter)
    # Everything below this return is intentionally disabled: the
    # repository co-watching (r_matrix) build/save passes.
    return
    # ------------------------------------------------------------
    iter = 0
    msg("making r_matrix_fwd")
    for repos in self.u_watching.values():
        repos.sort()
        for i in xrange(len(repos)):
            for j in xrange(i + 1, len(repos)):
                # Count users co-watching repositories r_i and r_j.
                r_i, r_j = repos[i], repos[j]
                if r_i not in self.r_matrix:
                    self.r_matrix[r_i] = {r_j: 1}
                elif r_j not in self.r_matrix[r_i]:
                    self.r_matrix[r_i][r_j] = 1
                else:
                    self.r_matrix[r_i][r_j] += 1
                iter += 1
                if iter % 100000 == 0:
                    msg("[] iter %d" % iter)
    iter = 0
    msg("saving r_matrix_fwd")
    values = []
    for r_i in self.r_matrix:
        for r_j in self.r_matrix[r_i]:
            values.append("(%d,%d,%d)" % (r_i, r_j, self.r_matrix[r_i][r_j]))
            iter += 1
            if iter % 5000 == 0:
                # Flush a batch of 5000 rows; commit every 10000.
                sql = "".join(("INSERT INTO r_matrix_fwd(r1,r2,val) VALUES",
                               ",".join(values)))
                c.execute(sql)
                values = []
            if iter % 10000 == 0:
                msg("DB iter %d" % iter)
                conn.commit()
    if values:
        sql = "".join(("INSERT INTO r_matrix_fwd(r1,r2,val) VALUES",
                       ",".join(values)))
        c.execute(sql)
    iter = 0
    msg("making r_matrix_bkwd")
    # NOTE(review): r_matrix is not cleared between the fwd and bkwd
    # passes, so the bkwd pass accumulates on top of the fwd counts --
    # verify that is intended before re-enabling this code path.
    for repos in self.u_watching.values():
        repos.sort(reverse=True)
        for i in xrange(len(repos)):
            for j in xrange(i + 1, len(repos)):
                r_i, r_j = repos[i], repos[j]
                if r_i not in self.r_matrix:
                    self.r_matrix[r_i] = {r_j: 1}
                elif r_j not in self.r_matrix[r_i]:
                    self.r_matrix[r_i][r_j] = 1
                else:
                    self.r_matrix[r_i][r_j] += 1
                iter += 1
                if iter % 100000 == 0:
                    msg("[] iter %d" % iter)
    iter = 0
    msg("saving r_matrix_bkwd")
    values = []
    for r_i in self.r_matrix:
        for r_j in self.r_matrix[r_i]:
            values.append("(%d,%d,%d)" % (r_i, r_j, self.r_matrix[r_i][r_j]))
            iter += 1
            if iter % 5000 == 0:
                sql = "".join(("INSERT INTO r_matrix_bkwd(r1,r2,val) VALUES",
                               ",".join(values)))
                c.execute(sql)
                values = []
            if iter % 10000 == 0:
                msg("DB iter %d" % iter)
                conn.commit()
    if values:
        sql = "".join(("INSERT INTO r_matrix_bkwd(r1,r2,val) VALUES",
                       ",".join(values)))
        c.execute(sql)
def user_process(self, user):
    """Returns ten recommendations for the given user id.

    Scores candidate repositories from several signals (nearest
    users, co-watch matrices, forks/parents/grandparents, shared
    authors and name prefixes), prunes over-represented authors and
    names, and falls back to a locality-biased popularity list when
    there are not enough scored candidates.
    """
    db = self.database
    if user not in db.u_watching:
        # blank son of a gun!  No watch history: recommend the most
        # popular repositories among users with nearby ids.
        msg("making local top_repos")
        top_repos = sorted(db.watching_r.items(),
                           key=lambda x: sum([1 for y in x[1]
                                              if abs(user - y) < 250]),
                           reverse=True)
        return [x[0] for x in top_repos][:10]
    # Local aliases for database mappings (some are referenced only by
    # the disabled blocks below).
    r_info = db.r_info
    r_name = db.r_name
    r_langs = db.r_langs
    r_lang_tuple = db.r_lang_tuple
    r_prefixes = db.r_prefixes
    top_repos = db.top_repos
    lang_by_r = db.lang_by_r
    u_watching = db.u_watching
    watching_r = db.watching_r
    forks_of_r = db.forks_of_r
    parent_of_r = db.parent_of_r
    # BUGFIX: was "db.parent_of_r", which made the "grandparent"
    # section below re-score parents instead of grandparents.
    gparent_of_r = db.gparent_of_r
    u_authoring = db.u_authoring
    scores = defaultdict(int)
    # find favorite author (by simple majority)
    fav_authors = {}
    """
    # ignore fav_authors
    authors = defaultdict(int)
    for r in u_watching[user]:
        if r in r_info:
            author = r_info[r][0]
            authors[author] += 1
    # grab top 3 authors
    authors = sorted(authors.items(), reverse=True, key=lambda x:x[1])[:3]
    if len(authors) > 1:
        total = float(sum([x[1] for x in authors]))
        for a_name, a_score in authors:
            if a_score == 1:
                continue
            # partition 16 appropriately
            fav_authors[a_name] = float(a_score) / float(total) * 16.0
        msg(fav_authors.items())
        msg("-" * 78)
    """
    """
    # generate language profile
    num_lang_r = 0
    lang_r = defaultdict(int)
    for r in u_watching[user]:
        if r in r_langs:
            num_lang_r += 1
            for lang, lnloc in r_langs[r]:
                lang_r[lang] += lnloc
    for lang in lang_r:
        lnloc = lang_r[lang] = num_lang_r
        for r1, lnloc2 in lang_by_r[lang]:
            if abs(lnloc2 - lnloc) <= 1:
                scores[r1] += 2.5
    """
    # Nearest users by precomputed distance (both matrix halves).
    conn = mysqldb.connect(host='127.0.0.1', user='******', passwd='',
                           db='matrix')
    c = conn.cursor()
    results = []
    c.execute(("SELECT u2, val "
               "FROM u_matrix_fwd "
               "WHERE u1=%d "
               "ORDER BY val DESC "
               "LIMIT 5") % user)
    results += list(c.fetchall())
    c.execute(("SELECT u2, val "
               "FROM u_matrix_bkwd "
               "WHERE u1=%d "
               "ORDER BY val DESC "
               "LIMIT 5") % user)
    results += list(c.fetchall())
    # dict() dedupes users seen in both halves.
    results = sorted(dict(results).items(), reverse=True,
                     key=lambda x: x[1])
    user_s = set(u_watching[user])
    r_neighbors = defaultdict(int)
    # Count repositories that the 5 nearest users watch and this user
    # does not.
    for u1, u_val in results[:5]:
        diff_s = set(u_watching[u1]) - user_s
        for r1 in diff_s:
            r_neighbors[r1] += 1
    r_neighbors = sorted(r_neighbors.items(), reverse=True,
                         key=lambda x: x[1])[:10]
    for r1, count in r_neighbors:
        # BUGFIX: was len(u_watching[r1]) -- r1 is a repository id, so
        # indexing the user-keyed dict always yielded an empty list
        # (score contribution of 0) and polluted the defaultdict.
        scores[r1] += 0.5 * log(1 + len(watching_r[r1]), 10)
    for r in u_watching[user]:
        # loop through all watched repositories
        # check r_matrix: co-watched repositories from both halves.
        results = []
        c.execute(("SELECT r2, val "
                   "FROM r_matrix_fwd "
                   "WHERE r1=%d "
                   "ORDER BY val DESC "
                   "LIMIT 20") % r)
        results += list(c.fetchall())
        c.execute(("SELECT r2, val "
                   "FROM r_matrix_bkwd "
                   "WHERE r1=%d "
                   "ORDER BY val DESC "
                   "LIMIT 20") % r)
        results += list(c.fetchall())
        results = sorted(dict(results).items(), reverse=True,
                         key=lambda x: x[1])
        # BUGFIX: was "if result not in u_watching[user]" -- comparing
        # a (repo, val) tuple against a list of ints is always True,
        # so already-watched repositories were never filtered out.
        results = [result for result in results
                   if result[0] not in u_watching[user]]
        for r1, val in results[:5]:
            scores[r1] += log(val + len(watching_r[r1]), 10)
        # find forks
        for r1 in forks_of_r[r]:
            scores[r1] += log(2 + len(watching_r[r1]), 10)
        # find parents and siblings
        if parent_of_r[r] > 0:
            parent = parent_of_r[r]
            scores[parent] += 2
            for r1 in forks_of_r[parent]:
                scores[r1] += log(2 + len(watching_r[r1]), 10)
                # find others by author of parent
                if r1 in r_info:
                    author = r_info[r1][0]
                    for r2 in u_authoring[author]:
                        scores[r2] += 0.5 * log(2 + len(watching_r[r2]), 10)
        # find grandparents and uncles/aunts
        if gparent_of_r[r] > 0:
            gparent = gparent_of_r[r]
            scores[gparent] += 3
            for r1 in forks_of_r[gparent]:
                scores[r1] += log(2 + len(watching_r[r1]), 10)
                # find others by author of gparent
                if r1 in r_info:
                    author = r_info[r1][0]
                    for r2 in u_authoring[author]:
                        scores[r2] += log(2 + len(watching_r[r2]), 10)
        # find others by author, name and prefixes
        if r in r_info:
            author, name = r_info[r][0], r_info[r][1]
            for r1 in sorted(u_authoring[author], reverse=True):
                scores[r1] += 1.5 * log(2 + len(watching_r[r1]), 10)
            # check names
            if name in r_name:
                for r1 in r_name[name]:
                    scores[r1] += log(1 + len(watching_r[r1]), 10)
            words = name.lower().replace("-", "_").replace(".", "_")
            words = words.split("_")
            prefixes = [w for w in words if len(w) > 2]
            if not prefixes:
                continue
            for i in xrange(1, len(prefixes) - 1):
                prefix = "-".join(prefixes[0:i])
                if prefix in r_prefixes:
                    for r2 in r_prefixes[prefix]:
                        # NOTE(review): this uses r1 (leftover from the
                        # loops above), not r2 -- possibly intended to
                        # be watching_r[r2]; preserved as-is pending
                        # confirmation.
                        scores[r2] += (0.25 * i *
                                       log(1 + len(watching_r[r1]), 10))
    """
    if len(u_watching[user]) > 7:
        dates = [r_info[r][2] for r in u_watching[user] if r in r_info]
        # msg(dates)
        mean = sum(dates) / len(dates)
        # msg("mean is %s" % date(1,1,1).fromordinal(mean))
        std_dev = (sum([(x - mean) ** 2 for x in dates]) / len(dates)) ** 0.5
        threshold = std_dev * 2.5
        # msg("std_dev is %f" % std_dev)
        for r1 in scores:
            if r1 not in r_info:
                continue
            created = r_info[r1][2]
            if abs(created - mean) > threshold:
                scores[r1] -= 10.0
    """
    # cleanup: never recommend what the user already watches (or the
    # sentinel repository 0).
    for r in u_watching[user] + [0]:
        try:
            del scores[r]
        except:
            pass
    orig_scores = scores
    scores = sorted(scores.items(), reverse=True, key=lambda x: x[1])
    # Prune over-represented authors (max 2 each) and names (max 5).
    authors = defaultdict(int)
    names = defaultdict(int)
    purge = []
    iter = 0
    for r, score in scores:
        if r in r_info:
            author, name, _ = r_info[r]
            authors[author] += 1
            names[name] += 1
            if authors[author] > 2:
                purge.append(iter)
            elif names[name] > 5:
                purge.append(iter)
        iter += 1
    # Delete by index from the end so earlier indices stay valid.
    for i in sorted(purge, reverse=True):
        del scores[i]
    # boost by date (disabled)
    if False:
        dates = []
        for r in u_watching[user]:
            if r not in r_info:
                continue
            dates.append(r_info[r][2])
        dates = sorted(dates)[1:-1]
        mean = sum(dates) / len(dates)
        std_dev = (sum([(x - mean) ** 2 for x in dates])
                   / len(dates)) ** 0.5
        if std_dev:
            for iter in xrange(10, len(scores)):
                r, score = scores[iter]
                if r not in r_info:
                    continue
                boost = 1.0 - ((r_info[r][2] - mean)
                               / (2.5 * std_dev)) ** 2.0
                if boost > 0:
                    score += boost
                    scores[iter] = (r, score)
    # With very many candidates, keep only the statistical outliers.
    if len(scores) > 3000:
        mean = sum([x[1] for x in scores]) / len(scores)
        std_dev = (sum([(x[1] - mean) ** 2 for x in scores])
                   / len(scores)) ** 0.5
        cutoff = mean + std_dev * 2.5
        scores_ = sorted([x for x in scores if x[1] > cutoff])
        if scores_:
            scores = scores_
    # Debug dump of watch list and scored candidates.
    if True:
        output = []
        fh = file("debug.txt", "a")
        output.append("user: %5d" % user)
        output.append("watching: %5d, scores: %5d"
                      % (len(u_watching[user]), len(scores)))
        output.append("")
        for r in u_watching[user]:
            if r in r_info:
                output.append(" WATCH %8d: %-20s %-50s %s %s"
                              % (r, r_info[r][0], r_info[r][1],
                                 r_info[r][2],
                                 date(1, 1, 1).fromordinal(r_info[r][2])))
            else:
                output.append(" WATCH %8d" % r)
        output.append("-")
        scores_ = sorted(scores,
                         key=lambda x: (1 if x in u_watching[user] else 0,
                                        x[1]),
                         reverse=True)
        for r, score in scores_:
            if r in u_watching[user]:
                if r in r_info:
                    output.append(" %6.3f %8d: %-20s %-50s %s %s"
                                  % (score, r, r_info[r][0], r_info[r][1],
                                     r_info[r][2],
                                     date(1, 1, 1).fromordinal(r_info[r][2])))
                else:
                    output.append(" %4.2f %8d" % (score, r))
            else:
                if r in r_info:
                    output.append("++ %6.3f %8d: %-20s %-50s %s %s"
                                  % (score, r, r_info[r][0], r_info[r][1],
                                     r_info[r][2],
                                     date(1, 1, 1).fromordinal(r_info[r][2])))
                else:
                    output.append("++ %4.2f %8d" % (score, r))
        output.append("")
        fh.write("\n".join(output))
        fh.close()
    scores.sort(reverse=True, key=lambda x: x[1])
    top_scores = [repos[0] for repos in scores[:10]]
    num_scores = len(top_scores)
    if not num_scores:
        msg(" no scores! so, making local top_repos")
        top_repos = sorted(db.watching_r.items(),
                           key=lambda x: sum([1 for y in x[1]
                                              if abs(user - y) < 250]),
                           reverse=True)
        return [x[0] for x in top_repos][:10]
    else:
        avg_score = (float(sum([repos[1] for repos in scores[:num_scores]]))
                     / num_scores)
        msg(" avg: %6.2f - 1st: %6.2f - last: %6.2f"
            % (avg_score, scores[0][1], scores[num_scores - 1][1]))
    # Pad short recommendation lists from the locality-biased
    # popularity fallback.
    if num_scores < 10:
        msg("making local top_repos since num_scores < 10")
        top_repos = sorted(db.watching_r.items(),
                           key=lambda x: sum([1 for y in x[1]
                                              if abs(user - y) < 250]),
                           reverse=True)
        top_repos = [x[0] for x in top_repos][:10]
        for r in top_repos:
            if r not in top_scores:
                top_scores.append(r)
                if len(top_scores) >= 10:
                    break
    return top_scores