Example #1
0
    def parse_test(self):
        """Parse test.txt, which lists the test subjects.

        Reads ``<self.datadir>/test.txt``, converts every non-empty line to
        an int and stores the sorted list in ``self.test_u``.
        """

        msg("parsing test.txt")
        path = '/'.join((self.datadir, "test.txt"))
        # Context manager so the handle is released even if the read fails.
        with open(path) as handle:
            lines = handle.read().split("\n")
        self.test_u = sorted(int(line) for line in lines if line)
Example #2
0
    def parse_lang(self):
        """Parse lang.txt, which has language composition information.

        Every non-empty line is read as ``<repos>:<lang>;<n>,<lang>;<n>,...``
        (split on ":", then ",", then ";").  For each (repos, lang, n) the
        value ``int(log(n + 1, 10))`` is recorded in two tables:

        * ``self.lang_by_r[lang]`` gets ``(lnloc, repos)``
        * ``self.r_langs[repos]`` gets ``(lang, lnloc)``

        Language names are lower-cased.  Each ``lang_by_r`` list is finally
        sorted by repository id.
        """

        msg("parsing lang.txt")
        path = '/'.join((self.datadir, "lang.txt"))
        # Context manager so the handle is released even if the read fails.
        with open(path) as handle:
            lines = handle.read().split("\n")

        # Parse the whole file before touching any table, so a malformed
        # line raises without leaving the tables half filled.
        pairs = []
        for line in lines:
            if not line:
                continue
            fields = line.split(":")
            entries = [item.split(";") for item in fields[1].split(",")]
            langs = tuple((int(entry[1]), entry[0].lower())
                          for entry in entries)
            pairs.append((int(fields[0]), langs))

        msg("build lang_by_r and r_langs")
        for repos, langs in pairs:
            for kloc, lang in langs:
                lnloc = int(log(kloc + 1, 10))
                self.lang_by_r[lang].append((lnloc, repos))
                self.r_langs[repos].append((lang, lnloc))

        for entries in self.lang_by_r.values():
            entries.sort(key=lambda entry: entry[1])
Example #3
0
    def fill_pickle_jar(self):
        """Pickle the attributes named in ``self.fields`` to pickle.jar.

        Writes one dict to ``<self.datadir>/pickle.jar``: each name in
        ``self.fields`` maps to the attribute of that name, and the extra
        key ``'fields'`` holds ``self.fields`` itself.
        """
        jar = '/'.join((self.datadir, "pickle.jar"))

        msg("Filling pickle jar '%s'" % jar)

        d = dict((field, getattr(self, field)) for field in self.fields)
        d['fields'] = self.fields

        # Context manager so the jar is closed even when pickling raises.
        # NOTE(review): text mode is kept from the original; binary mode
        # ('wb') is needed for pickle protocols >= 1 and on Python 3 --
        # confirm against whatever reads the jar before changing it.
        with open(jar, 'w') as jarf:
            pickle.dump(d, jarf)
Example #4
0
    def parse_repos(self):
        """Parse repos.txt, which has repository lineage information.

        Every non-empty line is split on ":" and "," into
        ``repos, author/name, creation[, parent]``; a missing parent is
        treated as 0 ("no parent").  The following tables are filled:

        * ``forks_of_r[parent]`` / ``parent_of_r[repos]`` when parent > 0
        * ``r_info[repos] = (author, name, creation)`` where creation is the
          proleptic ordinal of the ``YYYY-MM-DD`` date
        * ``u_authoring[author]`` and ``r_name[name]`` (lists of repos ids)
        * ``r_prefixes[prefix]`` for leading word groups of the name
        * ``gparent_of_r[repos]`` for repositories whose parent has a parent
        """

        msg("parsing repos.txt")
        path = '/'.join((self.datadir, "repos.txt"))
        # Context manager so the handle is released even if the read fails.
        with open(path) as handle:
            lines = handle.read().split("\n")

        # Parse everything up front so a malformed line raises before any
        # table has been modified.
        records = []
        for line in lines:
            if not line:
                continue
            fields = line.replace(":", ",").split(",")
            parent = int(fields[3]) if fields[3:4] else 0
            records.append((int(fields[0]), parent, fields[1], fields[2]))

        for repos, parent, full_name, created in records:
            if parent > 0:
                self.forks_of_r[parent].append(repos)
                self.parent_of_r[repos] = parent

            author, name = full_name.split("/")
            ymd = [int(part) for part in created.split("-")]
            creation = date(ymd[0], ymd[1], ymd[2]).toordinal()
            self.r_info[repos] = (author, name, creation)
            self.u_authoring[author].append(repos)
            self.r_name[name].append(repos)

            # Words of the name longer than two characters, minus the last
            # one; the loop then indexes every proper leading group of them
            # (the full group is never used).
            words = name.lower().replace("-", "_").replace(".", "_")
            prefixes = [w for w in words.split("_") if len(w) > 2][:-1]
            for i in range(1, len(prefixes)):
                prefix = "-".join(prefixes[:i])
                if prefix in ('the', 'test', 'php', 'acts'):
                    continue
                self.r_prefixes[prefix].append(repos)

        # A grandparent is simply the parent's parent.
        for child, parent in self.parent_of_r.items():
            if parent in self.parent_of_r:
                self.gparent_of_r[child] = self.parent_of_r[parent]
Example #5
0
    def summary(self, unabridged=False):
        """Pretty-print the parsed tables and the test-user list.

        unabridged -- when false (the default) only the first five items of
                      each mapping and the first five entries of
                      ``self.test_u`` are printed; when true everything is.
        """
        # Local import keeps this method self-contained.  islice takes the
        # first five items whether items() returns a list or a view.
        from itertools import islice

        props = ("watching_r",
                 "u_watching",
                 "r_info",
                 "r_name",
                 "r_langs",
                 "forks_of_r",
                 "parent_of_r",
                 "gparent_of_r",
                 "lang_by_r",
                 "u_authoring")
        for prop in props:
            print(">> %s" % prop)
            items = getattr(self, prop).items()
            if not unabridged:
                items = islice(items, 5)
            pprint(dict(items))
            print("")

        msg("test_u")
        pprint(self.test_u if unabridged else self.test_u[:5])
Example #6
0
    def process(self, partition=1):
        """Compute recommendations for every user in ``database.test_u``.

        Users are handled in descending id order; the result of
        ``self.user_process(u)`` is stored in ``self.recommended[u]`` and
        progress is reported every ten users.

        partition -- when greater than 1, ``database.test_u`` is first
                     replaced by the first ``len(test_u) // partition``
                     users in ascending order, so only that slice is
                     processed.  The default of 1 processes everyone.
        """
        db = self.database

        if partition > 1:
            # Floor division: the result is used as a slice bound.
            new_len = len(db.test_u) // partition
            msg("Partitioning 1/%d [%d]" % (partition, new_len))
            db.test_u = sorted(db.test_u)[:new_len]

        msg("Beginning recommendations")
        total = len(db.test_u)
        for i, u in enumerate(sorted(db.test_u, reverse=True), 1):
            self.recommended[u] = self.user_process(u)
            if i % 10 == 0:
                msg("[%3.2f%%] %d/%d processed"
                    % (float(i)/float(total)*100.0, i, total))
Example #7
0
    def parse_watching(self):
        """Parse data.txt, which has the main user-repository relationships.

        Every non-empty line is ``<user>:<repos>`` (two ints).  The method

        1. fills ``self.watching_r[repos]`` (users) and
           ``self.u_watching[user]`` (repositories);
        2. fills ``self.r_idf[repos]`` with ``(user, tf_idf)`` pairs and
           ``self.r_idf_avg[repos]`` with their mean, where
           ``tf = 1 / len(u_watching[user])`` and
           ``idf = log(n_users / (1 + n_watchers))``;
        3. stores the ids of the 50 most-watched repositories in
           ``self.top_repos``;
        4. when ``self.save_db`` is true, writes the pairwise user distance
           tables ``u_matrix_fwd`` and ``u_matrix_bkwd`` to MySQL.
        """

        msg("parsing data.txt")
        path = '/'.join((self.datadir, "data.txt"))
        # Context manager so the handle is released even if the read fails.
        with open(path) as handle:
            lines = handle.read().split("\n")

        pairs = [[int(x) for x in line.split(":")] for line in lines if line]
        for user, repos in pairs:
            self.watching_r[repos].append(user)
            self.u_watching[user].append(repos)

        msg("calculating tf-idf")
        count = 0
        total_users = float(len(self.u_watching))
        for repos, users in self.watching_r.items():
            idf_repos = log(total_users / (1.0 + len(users)))
            tf_idf_sum = 0.0
            for user in users:
                tf_user = 1.0 / len(self.u_watching[user])
                tf_idf = tf_user * idf_repos
                tf_idf_sum += tf_idf
                self.r_idf[repos].append((user, tf_idf))

                # progress counter
                count += 1
                if count % 10000 == 0:
                    msg("tf-idf iter %d" % count)
            self.r_idf_avg[repos] = tf_idf_sum / len(users)

        msg("making top_repos")
        by_watchers = sorted(self.watching_r.items(),
                             key=lambda item: len(item[1]),
                             reverse=True)
        self.top_repos = [item[0] for item in by_watchers[:50]]

        if not self.save_db:
            return

        conn = mysqldb.connect(host='127.0.0.1',
                               user='******',
                               passwd='',
                               db='matrix')
        try:
            c = conn.cursor()
            self._store_user_matrix(conn, c, "u_matrix_fwd", "umf",
                                    sorted(self.u_watching.keys()))
            self._store_user_matrix(conn, c, "u_matrix_bkwd", "umb",
                                    sorted(self.u_watching.keys(),
                                           reverse=True))
            # The repository matrices are deliberately not rebuilt here;
            # see _store_repos_matrices.
        finally:
            # Release the connection even if an INSERT fails.
            conn.close()

    def _store_user_matrix(self, conn, c, table, tag, users):
        """Insert one row per ordered user pair into ``table``.

        For every pair ``(users[i], users[j])`` with ``i < j`` a row
        ``(u1, u2, SQRT(d))`` is inserted, ``d`` being the size of the
        symmetric difference of the two users' watch lists.  Rows are sent
        and committed in batches of 10000.

        conn, c -- open DB connection and its cursor
        table   -- target table name
        tag     -- short label used in the progress messages
        users   -- user ids in the order that defines the pairs
        """
        from itertools import combinations

        msg("making %s" % table)
        # Build each user's set once rather than twice per pair.
        watch_sets = dict((u, set(self.u_watching[u])) for u in users)
        head = "INSERT INTO %s(u1,u2,val) VALUES" % table

        count = 0
        values = []
        # combinations() yields (users[i], users[j]) for i < j, in the same
        # order as the equivalent nested index loops.
        for u1, u2 in combinations(users, 2):
            diff = len(watch_sets[u1] ^ watch_sets[u2])
            values.append("(%d,%d,SQRT(%d))" % (u1, u2, diff))

            count += 1
            if count % 10000 == 0:
                c.execute(head + ",".join(values))
                conn.commit()
                values = []
                msg("%s iter %d" % (tag, count))
        if values:
            c.execute(head + ",".join(values))
            conn.commit()
            msg("%s iter %d [END]" % (tag, count))

    def _store_repos_matrices(self, conn, c):
        """Build and save the repository co-watch counts (not called).

        This logic used to sit after an unconditional ``return`` in
        parse_watching, so it never ran there.  It is kept as an explicit,
        uncalled helper because user_process reads the ``r_matrix_fwd`` and
        ``r_matrix_bkwd`` tables that it fills.

        For each user's watch list (sorted in place, ascending for the fwd
        pass and descending for the bkwd pass) every pair ``(r_i, r_j)``
        with ``r_i`` before ``r_j`` is counted in ``self.r_matrix``; the
        counts are then inserted into the pass's table.

        NOTE(review): behaviour is carried over unchanged, including two
        things to confirm before re-enabling: ``self.r_matrix`` is not
        reset between the passes, so the bkwd table also receives the fwd
        pairs; and no commit follows the final partial batch.
        """
        from itertools import combinations

        passes = (("r_matrix_fwd", False), ("r_matrix_bkwd", True))
        for table, descending in passes:
            count = 0
            msg("making %s" % table)
            for repos in self.u_watching.values():
                repos.sort(reverse=descending)
                for r_i, r_j in combinations(repos, 2):
                    row = self.r_matrix.setdefault(r_i, {})
                    row[r_j] = row.get(r_j, 0) + 1

                    count += 1
                    if count % 100000 == 0:
                        msg("[] iter %d" % count)

            count = 0
            msg("saving %s" % table)
            head = "INSERT INTO %s(r1,r2,val) VALUES" % table
            values = []
            for r_i in self.r_matrix:
                for r_j in self.r_matrix[r_i]:
                    values.append("(%d,%d,%d)"
                                  % (r_i, r_j, self.r_matrix[r_i][r_j]))
                    count += 1
                    if count % 5000 == 0:
                        c.execute(head + ",".join(values))
                        values = []
                    if count % 10000 == 0:
                        msg("DB iter %d" % count)
                        conn.commit()
            if values:
                c.execute(head + ",".join(values))
Example #8
0
    def user_process(self, user):
        """Return up to ten recommended repository ids for ``user``.

        A user with no entry in ``database.u_watching`` gets the "local"
        popularity ranking (see _local_top_repos).  Otherwise candidate
        repositories are scored from:

        * repositories watched by the neighbouring users read from the
          ``u_matrix_fwd`` / ``u_matrix_bkwd`` tables;
        * for every repository the user watches: co-watched repositories
          from ``r_matrix_fwd`` / ``r_matrix_bkwd``, its forks, its parent
          and grandparent with their forks, other repositories by the same
          author, same-named repositories and shared name prefixes.

        Watched repositories are dropped from the candidates, at most two
        candidates per author and five per name are kept, and the ten best
        are returned.  When fewer than ten remain the list is padded from
        the local popularity ranking.

        Side effects: opens (and closes) a MySQL connection, appends a
        report to ``debug.txt`` and logs through ``msg``.
        """
        db = self.database

        if user not in db.u_watching:
            # blank son of a gun!
            msg("making local top_repos")
            return self._local_top_repos(user)

        r_info = db.r_info
        r_name = db.r_name
        r_prefixes = db.r_prefixes
        u_watching = db.u_watching
        watching_r = db.watching_r
        forks_of_r = db.forks_of_r
        parent_of_r = db.parent_of_r
        gparent_of_r = db.gparent_of_r
        u_authoring = db.u_authoring

        watched = u_watching[user]
        user_s = set(watched)
        scores = defaultdict(int)

        def popularity(r, offset):
            # log10 of the watcher count, shifted by ``offset``.
            return log(offset + len(watching_r[r]), 10)

        def score_family(ancestor, bonus, author_weight):
            # Reward an ancestor, each of its forks, and everything written
            # by the author of each of those forks.
            scores[ancestor] += bonus
            for r1 in forks_of_r[ancestor]:
                scores[r1] += popularity(r1, 2)
                if r1 in r_info:
                    for r2 in u_authoring[r_info[r1][0]]:
                        scores[r2] += author_weight * popularity(r2, 2)

        conn = mysqldb.connect(host='127.0.0.1',
                               user='******',
                               passwd='',
                               db='matrix')
        try:
            c = conn.cursor()

            # NOTE(review): parse_watching stores val as SQRT of the size
            # of the symmetric difference of two watch lists, so taking the
            # largest values selects the users that differ most -- confirm
            # that this is the intended notion of "neighbour".
            neighbors = self._matrix_rows(c, "u", user, 5)
            r_neighbors = defaultdict(int)
            for u1, u_val in neighbors[:5]:
                for r1 in set(u_watching[u1]) - user_s:
                    r_neighbors[r1] += 1

            r_neighbors = sorted(r_neighbors.items(),
                                 reverse=True,
                                 key=lambda x: x[1])[:10]
            for r1, count in r_neighbors:
                # Popularity is the number of watchers of the repository.
                scores[r1] += 0.5 * popularity(r1, 1)

            for r in watched:
                # co-watched repositories, skipping those already watched
                related = [(r2, val)
                           for r2, val in self._matrix_rows(c, "r", r, 20)
                           if r2 not in user_s]
                for r1, val in related[:5]:
                    scores[r1] += log(val + len(watching_r[r1]), 10)

                # forks
                for r1 in forks_of_r[r]:
                    scores[r1] += popularity(r1, 2)

                # parent and siblings; .get() avoids inserting placeholder
                # entries into the lineage tables
                parent = parent_of_r.get(r, 0)
                if parent > 0:
                    score_family(parent, 2, 0.5)

                # grandparent and uncles/aunts
                gparent = gparent_of_r.get(r, 0)
                if gparent > 0:
                    score_family(gparent, 3, 1.0)

                # others by author, name and prefixes
                if r not in r_info:
                    continue
                author, name = r_info[r][0], r_info[r][1]
                for r1 in u_authoring[author]:
                    scores[r1] += 1.5 * popularity(r1, 2)

                if name in r_name:
                    for r1 in r_name[name]:
                        scores[r1] += popularity(r1, 1)

                words = name.lower().replace("-", "_").replace(".", "_")
                prefixes = [w for w in words.split("_") if len(w) > 2]
                # Same prefix groups as parse_repos indexes: leading groups
                # that stop short of the last two long words.
                for i in range(1, len(prefixes) - 1):
                    prefix = "-".join(prefixes[:i])
                    if prefix in r_prefixes:
                        for r2 in r_prefixes[prefix]:
                            scores[r2] += 0.25 * i * popularity(r2, 1)
        finally:
            # One connection is opened per call; always give it back.
            conn.close()

        # cleanup: never recommend what is already watched, nor id 0 (the
        # "no parent" placeholder used by parse_repos)
        for r in watched:
            scores.pop(r, None)
        scores.pop(0, None)

        # Rank by score, keeping at most two repositories per author and
        # five per repository name.
        ranked = sorted(scores.items(), reverse=True, key=lambda x: x[1])
        authors = defaultdict(int)
        names = defaultdict(int)
        scores = []
        for r, score in ranked:
            if r in r_info:
                author, name, _ = r_info[r]
                authors[author] += 1
                names[name] += 1
                if authors[author] > 2 or names[name] > 5:
                    continue
            scores.append((r, score))

        # With very many candidates keep only the clear outliers (more than
        # 2.5 standard deviations above the mean), if there are any.
        if len(scores) > 3000:
            n = float(len(scores))
            mean = sum(x[1] for x in scores) / n
            std_dev = (sum((x[1] - mean) ** 2 for x in scores) / n) ** 0.5
            cutoff = mean + std_dev * 2.5
            scores_ = sorted(x for x in scores if x[1] > cutoff)
            if scores_:
                scores = scores_

        self._write_debug(user, scores)

        scores.sort(reverse=True, key=lambda x: x[1])
        top_scores = [x[0] for x in scores[:10]]
        num_scores = len(top_scores)

        if not num_scores:
            msg("  no scores! so, making local top_repos")
            return self._local_top_repos(user)

        avg_score = (float(sum(x[1] for x in scores[:num_scores]))
                     / num_scores)
        msg("  avg: %6.2f - 1st: %6.2f - last: %6.2f"
            % (avg_score, scores[0][1], scores[num_scores - 1][1]))

        if num_scores < 10:
            msg("making local top_repos since num_scores < 10")
            for r in self._local_top_repos(user):
                if r not in top_scores:
                    top_scores.append(r)
                if len(top_scores) >= 10:
                    break

        return top_scores

    def _local_top_repos(self, user):
        """Return up to ten repository ids ranked by "local" popularity.

        Repositories are ordered by how many of their watchers have an id
        within 250 of ``user``, most first.
        """
        ranked = sorted(self.database.watching_r.items(),
                        key=lambda x: sum(1 for y in x[1]
                                          if abs(user - y) < 250),
                        reverse=True)
        return [x[0] for x in ranked[:10]]

    def _matrix_rows(self, c, kind, key, limit):
        """Return merged rows for ``key`` from a fwd/bkwd matrix table pair.

        Reads ``(<kind>2, val)`` from ``<kind>_matrix_fwd`` and
        ``<kind>_matrix_bkwd`` where ``<kind>1 = key``, taking the ``limit``
        largest ``val`` from each table.  The rows are merged through a
        dict (one value per id) and returned as a list of ``(id, val)``
        sorted by ``val``, largest first.

        c     -- open DB cursor
        kind  -- "u" for the user tables, "r" for the repository tables
        key   -- integer id to look up
        limit -- maximum rows taken from each table
        """
        rows = []
        for table in ("%s_matrix_fwd" % kind, "%s_matrix_bkwd" % kind):
            c.execute(("SELECT %s2, val "
                       "FROM %s "
                       "WHERE %s1=%d "
                       "ORDER BY val DESC "
                       "LIMIT %d")
                      % (kind, table, kind, key, limit))
            rows += list(c.fetchall())
        return sorted(dict(rows).items(), reverse=True, key=lambda x: x[1])

    def _write_debug(self, user, scores):
        """Append a report for ``user`` to debug.txt.

        The report lists the repositories the user watches, then every
        ``(repos, score)`` candidate in ``scores`` by descending score;
        candidates the user does not already watch are marked with "++".
        """
        db = self.database
        r_info = db.r_info
        watched = db.u_watching[user]
        watched_s = set(watched)

        def describe(r):
            # (author, name, creation ordinal, creation date)
            info = r_info[r]
            return (info[0], info[1], info[2], date.fromordinal(info[2]))

        output = ["user: %5d" % user,
                  "watching: %5d, scores: %5d" % (len(watched), len(scores)),
                  ""]

        for r in watched:
            if r in r_info:
                output.append("    WATCH %8d: %-20s %-50s %s %s"
                              % ((r,) + describe(r)))
            else:
                output.append("    WATCH %8d" % r)
        output.append("-")

        ordered = sorted(scores,
                         key=lambda x: (1 if x[0] in watched_s else 0, x[1]),
                         reverse=True)
        for r, score in ordered:
            marker = "  " if r in watched_s else "++"
            if r in r_info:
                output.append("%s %6.3f %8d: %-20s %-50s %s %s"
                              % ((marker, score, r) + describe(r)))
            else:
                output.append("%s %4.2f %8d" % (marker, score, r))
        output.append("")

        # Context manager so the handle is closed even if the write fails.
        with open("debug.txt", "a") as fh:
            fh.write("\n".join(output))