Ejemplo n.º 1
0
    def so_clouds(self, year):
        """Render one word-cloud PNG per annotated community of ``year``.

        The circular mask image, the annotated community names and the top
        tags per community are loaded for the given year. The tag data is
        converted with ``self.convert_to_list_of_dict`` and the background
        colours are chosen with ``self.choose_colors``; both results are
        indexed by the community's position in the annotated list.

        Args:
            year: Year whose pickles are read; converted with ``str`` when
                building the input and output paths.
        """
        circle_mask = np.array(
            Image.open(
                "/home/iraklis/Desktop/StackOverflow/Media/Images/WordClouds/proper_circle.png"
            ))
        communities_prefix = self.path + "Communities0/" + str(year)
        community_names = tools.load_pickle(communities_prefix +
                                            "_annotated_communites.pickle")
        top_tags = tools.load_pickle(communities_prefix +
                                     "_top_tags_per_community.pickle")

        frequencies = self.convert_to_list_of_dict(top_tags)
        backgrounds = self.choose_colors(community_names)

        def black_text(*args, **kwargs):
            # Every word is drawn in black regardless of its properties.
            return (0, 0, 0)

        # Options shared by all clouds; only the background colour varies.
        shared_options = dict(mask=circle_mask,
                              max_words=60,
                              prefer_horizontal=1,
                              contour_width=0.1,
                              collocations=False,
                              margin=1,
                              width=660,
                              height=660,
                              color_func=black_text)
        output_prefix = self.path + "Word_Clouds/" + str(year) + "/"

        for position, community in enumerate(community_names):
            cloud = WordCloud(background_color=backgrounds[position],
                              **shared_options)
            cloud.generate_from_frequencies(frequencies[position])
            # Slashes are stripped so the name cannot introduce a directory.
            cloud.to_file(output_prefix + community.replace('/', '') + ".png")
Ejemplo n.º 2
0
    def overall(self):
        """Print the mean share of newcomer nodes between consecutive periods.

        For each pair of consecutive entries in ``self.all_dates`` the node
        sets of both periods are built from their ``_infomap_coms.pickle``
        files. The fraction of nodes in the later period that were absent
        from the earlier one is recorded, and the mean of these fractions is
        printed.

        A later period that contains no nodes is skipped, because its
        fraction is undefined. If no fraction could be computed, ``nan`` is
        printed.
        """

        def load_nodes(date):
            # Flatten the list of per-community node lists into one set.
            communities = tools.load_pickle(self.path + date +
                                            "_infomap_coms.pickle")
            return {node for node_list in communities for node in node_list}

        percentage_list = []
        prev_nodes = None
        for c_date, next_date in zip(self.all_dates, self.all_dates[1:]):
            if prev_nodes is None:
                prev_nodes = load_nodes(c_date)
            next_nodes = load_nodes(next_date)
            if next_nodes:
                new_users = next_nodes - prev_nodes
                percentage_list.append(len(new_users) / len(next_nodes))
            # Reuse this period's set as the baseline of the next pair so
            # every pickle is read only once.
            prev_nodes = next_nodes

        print(np.mean(percentage_list) if percentage_list else float("nan"))
Ejemplo n.º 3
0
    def community_timeseries(self, com_category, format):
        """Plot yearly relative community sizes for one category and save it.

        For every entry of ``self.all_dates`` the community node lists and
        their annotated names are loaded. Each community's share of that
        period's total node count is stored at the period's index in a
        per-community series. The communities listed under
        ``self.display_coms[com_category]`` are then plotted against yearly
        dates from 2008 through 2020 and the figure is written to
        ``<self.path>p_usrs_<com_category>.<format>``.

        Args:
            com_category: Key into ``self.display_coms`` selecting the
                communities to draw; also part of the output file name.
            format: Output format and file extension passed to
                ``plt.savefig``. The name shadows the builtin but is kept so
                existing callers keep working.
        """
        # Yearly x-axis points.
        s_date_obj = parse("2008-01-01T00:00:00.000")
        e_date_obj = parse("2021-01-01T00:00:00.000")
        m_delta = relativedelta(years=1)
        date_strings = list()
        current_date_obj = s_date_obj
        while current_date_obj < e_date_obj:
            date_strings.append(current_date_obj)
            current_date_obj += m_delta
        m_dates = matplotlib.dates.date2num(date_strings)

        # One slot per plotted date, so the series length always matches the
        # x-axis instead of relying on a separate hard-coded count.
        series_length = len(date_strings)

        community_timeseries_dict = dict()
        for idx, year in enumerate(self.all_dates):
            com_nodes = tools.load_pickle(self.path + year +
                                          "_infomap_coms.pickle")
            annotated = tools.load_pickle(self.path + year +
                                          "_annotated_communites.pickle")
            total_nodes = sum(len(com) for com in com_nodes)

            # Register every name seen in this period, even when it gets no
            # value below, so it still has an all-zero series.
            for com_name in annotated:
                community_timeseries_dict.setdefault(com_name,
                                                     [0] * series_length)

            for com, com_name in zip(com_nodes, annotated):
                # A period without any nodes contributes a share of zero
                # rather than raising ZeroDivisionError.
                share = len(com) / total_nodes if total_nodes else 0
                community_timeseries_dict[com_name][idx] = share

        for com in self.display_coms[com_category]:
            plt.plot_date(m_dates,
                          community_timeseries_dict[com],
                          'b-',
                          color=self.color_dict[com],
                          label=com)

        # Axis decoration does not depend on the plotted community, so it is
        # applied once after the loop.
        plt.xlabel('Date')
        plt.ylabel('Percentage of Users')
        plt.title("Community Sizes per Year")
        plt.xticks(rotation=45)

        # Fixed y-range, shown as percentages of 1.0.
        axes = plt.gca()
        axes.set_ylim([0, 0.41])
        axes.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
        plt.legend()
        plt.savefig(self.path + "p_usrs_" + com_category + "." + format,
                    bbox_inches='tight',
                    format=format,
                    dpi=300)
Ejemplo n.º 4
0
    def create_relations(self, year):
        """Write the pairwise user relation CSV for ``year``.

        The per-user score records of ``year`` are loaded and each record is
        rebuilt as ``[first field, normalised tag scores, set of tag keys]``.
        For every unordered pair of users one CSV row
        ``Source,Target,Weight`` is written, where the weight is
        ``2 - self.users_distance2(a, b)``.

        The normalised records are saved to
        ``year_scores/normalized_<year>``. The file name used to be
        hard-coded to ``normalized_2020`` for every year; for ``year=2020``
        the path is unchanged.

        Args:
            year: Year to process; converted with ``str`` for file names.
        """
        start_time = time.time()
        year_str = str(year)

        # Load and normalise before opening the output, so a failed load does
        # not leave a truncated CSV behind.
        user_tag_dict = tools.load_pickle(self.files_path + "year_scores/" +
                                          year_str)
        normalized_user_tag_dict = {
            user_id: [
                tag_list[0],
                # Deep copy so normalisation cannot alter the loaded scores.
                self.normalize_tag_score(copy.deepcopy(tag_list[1])),
                set(tag_list[1].keys())
            ]
            for user_id, tag_list in user_tag_dict.items()
        }

        users = list(normalized_user_tag_dict.items())
        print(len(users))
        with open(
                self.files_path + "Graphs/Relations/" + year_str +
                "_users_relations.csv", "w") as rel_file:
            rel_file.write("Source,Target,Weight\n")
            for idx, (outer_user_id, outer_entry) in enumerate(users[:-1]):
                print(idx)
                outer_prefix = str(outer_user_id) + ','
                # Only users after the current one: each pair appears once.
                for inner_user_id, inner_entry in users[idx + 1:]:
                    usr_distance2 = self.users_distance2(
                        outer_entry, inner_entry)
                    rel_file.write(outer_prefix + str(inner_user_id) + ',' +
                                   str(2 - usr_distance2) + "\n")

        tools.save_pickle(
            self.files_path + "year_scores/normalized_" + year_str,
            normalized_user_tag_dict)
        print("execution time", time.time() - start_time)
Ejemplo n.º 5
0
 def gather_all_names(self):
     """Add the annotated community names of 2008-2020 to ``self.all_com_names``.

     One ``_annotated_communites.pickle`` file is read per year from the
     ``Communities0`` folder under ``self.path``. A blank line is printed
     when all years have been read.
     """
     for current_year in range(2008, 2021):
         yearly_names = tools.load_pickle(
             self.path + "Communities0/" + str(current_year) +
             "_annotated_communites.pickle")
         self.all_com_names.update(yearly_names)
     print()
Ejemplo n.º 6
0
 def reform_post_lists(self):
     """Re-key every monthly post list by post field 3 and save it as a dict.

     Each file under ``Posts/posts_per_month/`` is loaded as a sequence of
     post records. A dict is built that maps ``int(record[3])`` to the tuple
     of the record's first three fields and stored under the same file name
     in ``pivot_files/reformed_posts/``.
     """
     print("In reform post lists.")
     source_pattern = self.data_path + "Posts/posts_per_month/*"
     target_dir = self.data_path + "pivot_files/reformed_posts/"
     for month_file in glob.iglob(source_pattern):
         month_name = month_file.rsplit("/", 1)[-1]
         print(month_name)
         # Later records win when two records share the same key.
         posts_by_id = {
             int(record[3]): (record[0], record[1], record[2])
             for record in tools.load_pickle(month_file)
         }
         tools.save_pickle(target_dir + month_name, posts_by_id)
     print("reform post lists done.")
Ejemplo n.º 7
0
    def process_scores(self):
        """Compute the slot score of every month and pickle the collection.

        Walks month by month from ``self.start_date_obj`` up to, but not
        including, ``self.end_date_obj``. Each month's score pickle is loaded
        from ``self.score_path``, passed through ``self.slot_score`` and
        stored in ``self.month_tag_scores`` under the ``"<year>-<month>"``
        key. The whole mapping is finally saved as ``month_tag_scores``.
        """
        one_month = relativedelta(months=1)
        cursor = self.start_date_obj
        while cursor < self.end_date_obj:
            # The month is not zero-padded, e.g. "2008-8".
            month_key = "{}-{}".format(cursor.year, cursor.month)
            print(month_key)
            monthly_scores = tools.load_pickle(self.score_path + month_key)
            self.month_tag_scores[month_key] = self.slot_score(monthly_scores)
            cursor += one_month

        tools.save_pickle(self.score_path + "month_tag_scores",
                          self.month_tag_scores)
Ejemplo n.º 8
0
 def create_date_indexes(self):
     """Save, for every month, the set of field-3 values of that month's posts.

     Walks month by month from ``self.start_date_obj`` up to, but not
     including, ``self.end_date_obj``. Each month's post pickle is read from
     ``Posts/posts_per_month/`` and the set of ``record[3]`` values is
     written to ``pivot_files/date_to_postid/`` under the same
     ``"<year>-<month>"`` name.
     """
     print("In create_date_indexes.")
     one_month = relativedelta(months=1)
     month_cursor = self.start_date_obj
     while month_cursor < self.end_date_obj:
         # The month is not zero-padded, e.g. "2008-8".
         month_key = "{}-{}".format(month_cursor.year, month_cursor.month)
         print(month_key)
         month_posts = tools.load_pickle(self.data_path + "Posts/posts_per_month/" + month_key)
         post_ids = {record[3] for record in month_posts}
         tools.save_pickle(self.data_path + "pivot_files/date_to_postid/" + month_key, post_ids)
         month_cursor += one_month
     print("create_date_indexes done.")
Ejemplo n.º 9
0
 def create_date_index(self):
     """Build an in-memory month index of post field-3 values.

     Covers every month from 2008-08 through 2020-12. For each month the
     post pickle under ``Posts/posts_per_month/`` is loaded and the set of
     ``record[3]`` values is collected.

     Returns:
         A list of ``(date_string, set)`` tuples in chronological order,
         where ``date_string`` has the form ``"<year>-<month>"``.
     """
     one_month = relativedelta(months=1)
     month_cursor = parse("2008-08-01T00:00:00.000")
     last_day = parse("2020-12-31T00:00:00.000")
     month_index = []
     while month_cursor < last_day:
         month_key = "{}-{}".format(month_cursor.year, month_cursor.month)
         month_posts = tools.load_pickle(self.data_path + "Posts/posts_per_month/" + month_key)
         month_index.append((month_key, {record[3] for record in month_posts}))
         month_cursor += one_month
     print("Date index created.")
     print(len(month_index))
     return month_index
Ejemplo n.º 10
0
    def timeseries_plots(self):
        """Plot the monthly series of the web-framework tags and save a PNG.

        The per-tag series are read from the ``tag_timeseries.pickle`` file
        and drawn against one point per month from ``self.start_date_obj`` up
        to, but not including, ``self.end_date_obj``. The figure is written
        to ``<self.path>/web_frameworks.png``.
        """
        # Alternative tag groups that were plotted with this method:
        # tag_string = ["android", "ios", "windows", "linux", "unix"]
        # tag_string = ["c++", "c", "python", "java", "r", "ruby", "javascript", "php", "c#"]
        frameworks = [
            "reactjs", "ruby-on-rails", "asp.net", "angular", "angularjs",
            "django", "vue.js", "laravel", "spring", "flask"
        ]
        series_by_tag = tools.load_pickle(
            "/home/iraklis/PycharmProjects/SO_New/SRC3/Revisions/I_O/Scores/"
            "tag_timeseries.pickle")
        palette = [
            'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
            'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan',
            'fuchsia', 'black'
        ]

        # One x-axis point per month of the configured range.
        one_month = relativedelta(months=1)
        month_starts = []
        cursor = self.start_date_obj
        while cursor < self.end_date_obj:
            month_starts.append(cursor)
            cursor += one_month
        x_values = matplotlib.dates.date2num(month_starts)

        # The palette is longer than the tag list, so pairing never truncates.
        for framework, line_color in zip(frameworks, palette):
            plt.plot_date(x_values,
                          series_by_tag[framework],
                          'b-',
                          color=line_color,
                          label=framework)

        plt.xlabel('Date')
        plt.ylabel('Vote Score')

        plt.title("Web Frameworks")
        # plt.title("Programming Languages")
        # plt.title("Operating Systems")
        plt.legend()
        plt.xticks(rotation=45)

        plt.savefig(self.path + "/web_frameworks.png",
                    bbox_inches='tight',
                    format="png",
                    dpi=300)
Ejemplo n.º 11
0
    def merge_months(self, year, start_month, end_month):
        """Merge monthly per-user score records into one record per user.

        Each monthly pickle ``Scores/<year>-<month>`` maps a user id to a
        record whose first field is a total and whose second field is a
        ``{tag: score}`` dict. Totals and per-tag scores are summed over the
        months and the result is saved to ``year_scores/<year>``.

        Args:
            year: Year used in the input and output file names.
            start_month: First month number to include.
            end_month: Month number at which to stop; it is NOT included
                because the months come from ``range(start_month, end_month)``.
        """
        scores_prefix = self.files_path + "Scores/" + str(year) + "-"
        merged = {}
        for month_number in range(start_month, end_month):
            monthly_records = tools.load_pickle(scores_prefix +
                                                str(month_number))
            for uid, record in monthly_records.items():
                if uid not in merged:
                    # First sighting: adopt the monthly record as-is.
                    merged[uid] = record
                    continue

                yearly_record = merged[uid]
                yearly_record[0] += record[0]
                yearly_tags = yearly_record[1]
                for tag_name, points in record[1].items():
                    if tag_name in yearly_tags:
                        yearly_tags[tag_name] += points
                    else:
                        yearly_tags[tag_name] = points

        tools.save_pickle(self.files_path + "year_scores/" + str(year),
                          merged)
Ejemplo n.º 12
0
 def reform_votes(self):
     """Attach a post month to each vote and save the votes sorted by it.

     Walks month by month from ``self.start_date_obj`` up to, but not
     including, ``self.end_date_obj``. Every vote of the month is checked
     against each ``(date_string, id_set)`` entry of
     ``self.post_date_index``. For every entry whose set contains
     ``vote[1]``, a tuple of the vote's first three fields plus a datetime
     for the first day of that entry's month is recorded. The recorded
     tuples are sorted by that datetime and written to
     ``pivot_files/reformed_votes/<year>-<month>``.
     """
     print("In reform votes.")
     one_month = relativedelta(months=1)
     month_cursor = self.start_date_obj
     while month_cursor < self.end_date_obj:
         month_key = "{}-{}".format(month_cursor.year, month_cursor.month)
         month_votes = tools.load_pickle(self.data_path + "Votes/votes_per_month/" + month_key)
         dated_votes = []
         for processed, vote in enumerate(month_votes):
             # Progress report every 10000 votes.
             if processed % 10000 == 0:
                 print((processed / len(month_votes) * 100), "%")
             # All index entries are scanned; there is no early exit, so a
             # vote is recorded once per entry that contains its id.
             for index_entry in self.post_date_index:
                 if vote[1] not in index_entry[1]:
                     continue
                 year_part, month_part = index_entry[0].split('-')[:2]
                 post_month = datetime(int(year_part), int(month_part), 1)
                 dated_votes.append((vote[0], vote[1], vote[2], post_month))
         dated_votes = sorted(dated_votes, key=itemgetter(3))
         tools.save_pickle(self.data_path + "pivot_files/reformed_votes/" + month_key, dated_votes)
         month_cursor += one_month
     print("Reform votes finished.")
Ejemplo n.º 13
0
    def working_on_votes(self):
        """Accumulate per-user and per-tag vote scores for every month.

        Walks month by month from ``self.start_date_obj`` up to, but not
        including, ``self.end_date_obj``. For each month the reformed votes
        are loaded and scored into ``self.date_to_user_score[<year>-<month>]``,
        which maps a user key to ``[total_score, {tag: score}]``:

        * a vote of type ``'2'`` on a question adds 10 to the question's user
          and to each of the question's tags;
        * a vote of type ``'1'`` on an answer adds 15, and a vote of type
          ``'2'`` adds 10, to the answer's user and to each tag of the
          answer's question.

        After all months are processed the whole mapping is pickled to
        ``pivot_files/month_scores``.

        NOTE(review): ``self.date_to_user_score[vote_date]`` is indexed
        without being created here, so it must already exist for every month.
        NOTE(review): vote types '1' / '2' presumably mean "accepted" /
        "upvote" - confirm against the votes data schema.
        """
        current_date_obj = self.start_date_obj
        month_delta = relativedelta(months=1)
        while current_date_obj < self.end_date_obj:
            # Month key without zero padding, e.g. "2008-8".
            vote_date = str(current_date_obj.year) + "-" + str(current_date_obj.month)
            print(vote_date)
            c_month_votes = tools.load_pickle(self.data_path + "pivot_files/reformed_votes/" + vote_date)
            vote_count = 0
            start_time = time.time()
            # The votes are ordered by the month of the post they were cast on, so
            # the reformed-posts file only has to be reloaded when that month changes.
            post_date = "2008-8"
            date_to_id_to_post = tools.load_pickle(self.data_path + "pivot_files/reformed_posts/" + post_date)

            # Votes on answers are collected together with the month of their
            # question and processed afterwards, sorted by that month, using the
            # same reload-on-change approach as for the posts above.
            answer_collection = list()

            for vote in c_month_votes:
                # Progress and timing report every 10000 votes.
                if vote_count % 10000 == 0:
                    print("Percentage: ", vote_count/len(c_month_votes) * 100, "%")
                    print("Execution time", time.time() - start_time)
                    start_time = time.time()
                vote_count += 1
                # vote[3] is a datetime; switch the loaded posts file when its month changes.
                if str(vote[3].year) + "-" + str(vote[3].month) != post_date:
                    post_date = str(vote[3].year) + "-" + str(vote[3].month)
                    date_to_id_to_post = tools.load_pickle(self.data_path + "pivot_files/reformed_posts/" + post_date)
                # vote[1] is the key of the voted post in the reformed-posts dict.
                post_tuple = date_to_id_to_post[int(vote[1])]
                # if question
                if post_tuple[0] == '1':
                    # if the post is not deleted
                    if post_tuple[1] != -99:
                        # getting a list of the posts tags
                        tag_list = self.parse_tags(post_tuple[2])
                        # post_tuple[1] is the user key the score is credited to.
                        if post_tuple[1] in self.date_to_user_score[vote_date]:
                            # Only votes of type '2' are scored for questions.
                            if vote[2] == '2':
                                self.date_to_user_score[vote_date][post_tuple[1]][0] += 10
                                for tag in tag_list:
                                    if tag in self.date_to_user_score[vote_date][post_tuple[1]][1]:
                                        self.date_to_user_score[vote_date][post_tuple[1]][1][tag] += 10
                                    else:
                                        self.date_to_user_score[vote_date][post_tuple[1]][1][tag] = 10
                        else:
                            # First scored vote for this user in this month: create the record.
                            if vote[2] == '2':
                                temp_tag_dict = dict()
                                for tag in tag_list:
                                    temp_tag_dict[tag] = 10
                                self.date_to_user_score[vote_date][post_tuple[1]] = [10, temp_tag_dict]

                # if answer
                if post_tuple[0] == '2':
                    if post_tuple[1] != -99:
                        # For answers post_tuple[2] is passed to get_post_date and is later
                        # used as the key of the question (see below).
                        question_date = self.get_post_date(post_tuple[2])
                        # 'no_post' is the marker returned when no date is available.
                        if question_date != 'no_post':
                            split_date = question_date.split('-')
                            temp_date_obj = datetime(int(split_date[0]), int(split_date[1]), 1)
                            # (post type, user key, question key, vote type, question month)
                            answer_collection.append((post_tuple[0], post_tuple[1], post_tuple[2],
                                                      vote[2], temp_date_obj))

            # Sort by question month so each reformed-posts file is loaded in one run.
            answer_collection.sort(key=itemgetter(4))
            question_date = "2008-8"
            date_to_id_to_question = tools.load_pickle(self.data_path + "pivot_files/reformed_posts/" + question_date)
            for answer in answer_collection:
                if str(answer[4].year) + "-" + str(answer[4].month) != question_date:
                    question_date = str(answer[4].year) + "-" + str(answer[4].month)
                    date_to_id_to_question = tools.load_pickle(self.data_path + "pivot_files/reformed_posts/"
                                                               + question_date)
                # The answer is scored against the tags of its question.
                question_tuple = date_to_id_to_question[int(answer[2])]
                tag_list = self.parse_tags(question_tuple[2])
                if answer[1] in self.date_to_user_score[vote_date]:
                    # Vote type '1' on an answer is worth 15 points.
                    if answer[3] == '1':
                        self.date_to_user_score[vote_date][answer[1]][0] += 15
                        for tag in tag_list:
                            if tag in self.date_to_user_score[vote_date][answer[1]][1]:
                                self.date_to_user_score[vote_date][answer[1]][1][tag] += 15
                            else:
                                self.date_to_user_score[vote_date][answer[1]][1][tag] = 15
                    # Vote type '2' on an answer is worth 10 points.
                    if answer[3] == '2':
                        self.date_to_user_score[vote_date][answer[1]][0] += 10
                        for tag in tag_list:
                            if tag in self.date_to_user_score[vote_date][answer[1]][1]:
                                self.date_to_user_score[vote_date][answer[1]][1][tag] += 10
                            else:
                                self.date_to_user_score[vote_date][answer[1]][1][tag] = 10
                else:
                    # First scored vote for this user in this month: create the record.
                    if answer[3] == '1':
                        temp_tag_dict = dict()
                        for tag in tag_list:
                            temp_tag_dict[tag] = 15
                        self.date_to_user_score[vote_date][answer[1]] = [15, temp_tag_dict]
                    if answer[3] == '2':
                        temp_tag_dict = dict()
                        for tag in tag_list:
                            temp_tag_dict[tag] = 10
                        self.date_to_user_score[vote_date][answer[1]] = [10, temp_tag_dict]
            current_date_obj += month_delta
        tools.save_pickle(self.data_path + "pivot_files/month_scores", self.date_to_user_score)
Ejemplo n.º 14
0
    def active_users(self):
        """Plot monthly post counts and active-user counts and save a PNG.

        For every month from 2008-08 through 2020-12 the month's score and
        post pickles are loaded from ``Month_Analysis``. Posts with type
        ``'1'`` are counted as questions and type ``'2'`` as answers, and the
        distinct ``record[1]`` values form the month's active users. The
        per-year union of active users is saved to ``year_active_users``.

        The figure uses two y-axes. The left one shows questions and answers,
        the right one shows users who posted and users with a score entry.
        It is written to ``<self.path>users.png``.

        The x tick labels are rotated on the left axes. The twin axes created
        by ``twinx()`` has its x-axis hidden, so rotating the labels of the
        current axes after ``twinx()``, as was done before, had no visible
        effect.
        """
        start_date_obj = parse("2008-08-01T00:00:00.000")
        end_date_obj = parse("2021-01-01T00:00:00.000")
        m_delta = relativedelta(months=1)
        date_strings = list()
        questions_rec = list()
        answer_rec = list()
        post_users_rec = list()
        score_users_rec = list()
        year_users_dict = dict()

        # creating the dates for the plot
        current_date_obj = start_date_obj
        while current_date_obj < end_date_obj:
            date_strings.append(current_date_obj)
            current_date_obj += m_delta
        m_dates = matplotlib.dates.date2num(date_strings)

        # Reuse the month list built above so the data and the x-axis can
        # never get out of step.
        for month_start in date_strings:
            year_month_str = str(month_start.year) + "-" + str(
                month_start.month)
            # the score and tags for each user(uid) for every month
            month_user_scores = tools.load_pickle(self.path +
                                                  "Month_Analysis/Scores/" +
                                                  year_month_str)
            month_posts = tools.load_pickle(self.path +
                                            "Month_Analysis/Posts/" +
                                            year_month_str)
            m_questions = 0
            m_answers = 0
            m_active_users = set()
            for record in month_posts:
                if record[0] == '1':
                    m_questions += 1
                if record[0] == '2':
                    m_answers += 1
                m_active_users.add(record[1])
            questions_rec.append(m_questions)
            answer_rec.append(m_answers)
            post_users_rec.append(len(m_active_users))
            score_users_rec.append(len(month_user_scores))
            year_users_dict.setdefault(str(month_start.year),
                                       set()).update(m_active_users)

        tools.save_pickle(self.path + "year_active_users", year_users_dict)

        fig, ax1 = plt.subplots()
        color = '#000000'
        ax1.set_xlabel('Date')
        ax1.set_ylabel('Number of Posts', color=color)
        # The two 'None'-format series draw nothing; they only add the labels
        # of the right-axis series to the left axes' legend.
        ax1.plot_date(m_dates,
                      questions_rec,
                      'None',
                      color="tab:red",
                      label="Users made a Post")
        ax1.plot_date(m_dates,
                      questions_rec,
                      'None',
                      color="tab:orange",
                      label="Users received Score")
        ax1.plot_date(m_dates,
                      questions_rec,
                      'b-',
                      color="tab:blue",
                      label="Questions")
        ax1.plot_date(m_dates,
                      answer_rec,
                      'b-',
                      color="tab:green",
                      label="Answers")
        ax1.tick_params(axis='y', labelcolor=color)
        ax1.legend()
        ax1.set_ylim([0, 330000])
        # Rotate the visible x tick labels, which belong to the left axes.
        ax1.tick_params(axis='x', labelrotation=45)

        # instantiate a second axes that shares the same x-axis
        ax2 = ax1.twinx()

        color = 'tab:red'
        # we already handled the x-label with ax1
        ax2.set_ylabel('Active Users', color=color)
        ax2.plot_date(m_dates,
                      post_users_rec,
                      'b-',
                      color="tab:red",
                      label="Users made a Post")
        ax2.plot_date(m_dates,
                      score_users_rec,
                      'b-',
                      color="tab:orange",
                      label="Users received Score")
        ax2.set_ylim([0, 440000])
        ax2.tick_params(axis='y', labelcolor=color)

        fig.tight_layout()  # otherwise the right y-label is slightly clipped

        plt.title("Posts and Active Users per Month")

        plt.savefig(self.path + "users.png",
                    bbox_inches='tight',
                    format="png",
                    dpi=300)