Example #1
0
    def create_relations(self, year, normalized_name="normalized_2020"):
        """Write the pairwise user-relation edge list for ``year`` as a CSV.

        Loads the per-user records pickled at ``year_scores/<year>`` and, for
        every user, builds ``[record[0], normalized tag scores, set of tags]``
        using ``self.normalize_tag_score``. For every unordered pair of users
        one ``Source,Target,Weight`` row is written to
        ``Graphs/Relations/<year>_users_relations.csv`` with weight
        ``2 - self.users_distance2(a, b)``. The normalized records are then
        pickled under ``year_scores/<normalized_name>``.

        Args:
            year: Year whose score file is read; also part of the CSV name.
            normalized_name: File name inside ``year_scores/`` for the pickled
                normalized records. The default is the previously hard-coded
                ``"normalized_2020"``, so existing callers behave as before;
                pass e.g. ``"normalized_" + str(year)`` when processing
                another year so the 2020 file is not overwritten.
        """
        start_time = time.time()
        scores_dir = self.files_path + "year_scores/"

        # Load and normalize before opening the output file, so a missing or
        # unreadable score pickle does not leave a truncated CSV behind.
        user_tag_dict = tools.load_pickle(scores_dir + str(year))
        normalized_user_tag_dict = {
            user_id: [
                tag_list[0],
                # a deep copy is handed over, so the loaded scores stay as-is
                self.normalize_tag_score(copy.deepcopy(tag_list[1])),
                set(tag_list[1].keys()),
            ]
            for user_id, tag_list in user_tag_dict.items()
        }
        all_users_ids = list(normalized_user_tag_dict)
        print(len(all_users_ids))

        csv_path = (self.files_path + "Graphs/Relations/" + str(year) +
                    "_users_relations.csv")
        with open(csv_path, "w") as rel_file:
            rel_file.write("Source,Target,Weight\n")
            # Each user is paired only with the users that follow it in the
            # list, so every unordered pair is written exactly once.
            for idx, outer_user_id in enumerate(all_users_ids[:-1]):
                print(idx)
                outer_record = normalized_user_tag_dict[outer_user_id]
                for inner_user_id in all_users_ids[idx + 1:]:
                    usr_distance2 = self.users_distance2(
                        outer_record,
                        normalized_user_tag_dict[inner_user_id])
                    rel_file.write(",".join((str(outer_user_id),
                                             str(inner_user_id),
                                             str(2 - usr_distance2))) + "\n")

        tools.save_pickle(scores_dir + normalized_name,
                          normalized_user_tag_dict)
        print("execution time", time.time() - start_time)
Example #2
0
 def reform_post_lists(self):
     """Re-key every monthly post list by post id and pickle the result.

     For each file matching ``Posts/posts_per_month/*`` the pickled records
     are turned into a dict that maps ``int(record[3])`` to the tuple
     ``(record[0], record[1], record[2])``. The dict is saved under the same
     file name in ``pivot_files/reformed_posts/``.
     """
     print("In reform post lists.")
     source_pattern = self.data_path + "Posts/posts_per_month/*"
     target_dir = self.data_path + "pivot_files/reformed_posts/"
     for month_path in glob.iglob(source_pattern):
         # keep only the last path component (paths are split on "/")
         month_name = month_path.rsplit("/", 1)[-1]
         print(month_name)
         records = tools.load_pickle(month_path)
         posts_by_id = {
             int(record[3]): (record[0], record[1], record[2])
             for record in records
         }
         tools.save_pickle(target_dir + month_name, posts_by_id)
     print("reform post lists done.")
Example #3
0
    def process_scores(self):
        """Compute the slot scores of every month and pickle them together.

        Walks month by month from ``self.start_date_obj`` up to, but not
        including, ``self.end_date_obj``. The pickle found at
        ``self.score_path + "<year>-<month>"`` is passed to
        ``self.slot_score`` and the result is stored in
        ``self.month_tag_scores`` under the same ``"<year>-<month>"`` key.
        The complete mapping is finally saved as ``month_tag_scores``.
        """
        one_month = relativedelta(months=1)
        cursor = self.start_date_obj
        while cursor < self.end_date_obj:
            # month number is not zero-padded, e.g. "2008-8"
            month_key = "-".join((str(cursor.year), str(cursor.month)))
            print(month_key)
            monthly_scores = tools.load_pickle(self.score_path + month_key)
            self.month_tag_scores[month_key] = self.slot_score(monthly_scores)
            cursor += one_month

        tools.save_pickle(self.score_path + "month_tag_scores",
                          self.month_tag_scores)
Example #4
0
    def parse_votes(self, year):
        """Split the votes of ``year`` in ``Votes/Votes.xml`` into monthly pickles.

        The first eight lines of the file are skipped; every following line
        is parsed as one XML element. Votes whose ``CreationDate`` falls in
        ``year`` are grouped by ``"<year>-<month>"`` as tuples of the
        attribute strings ``(Id, PostId, VoteTypeId, CreationDate)`` and each
        group is pickled to ``Votes/votes_per_month/<year>-<month>``.

        A line that cannot be parsed as XML is reported and skipped; this now
        also holds when the latin-1 fallback parse fails, which previously
        aborted the whole run.

        Args:
            year: Calendar year to keep; compared with the ``year`` attribute
                of the parsed ``CreationDate``.

        Raises:
            StopIteration: If the file has fewer than eight lines.
        """
        # I will parse votes for each month
        date_list_dict = defaultdict(list)
        with open(self.folder_path + "Votes/Votes.xml") as xml_file:
            # Original author's note: the skipped lines are the meta info and
            # the first six votes, which are on the last day of July and would
            # not fit the one-month windows.
            for _ in range(8):
                next(xml_file)
            post_count = 0
            # "<year>-<month>" of the most recently parsed vote (0 until the
            # first one); only used in the progress output below.
            last_seen_month = 0
            for line in xml_file:
                if post_count % 1000000 == 0:
                    print(post_count)
                    print(last_seen_month)
                post_count += 1
                try:
                    try:
                        vote_info = eTree.fromstring(line)
                    except UnicodeDecodeError:
                        # retry on the latin-1 bytes, dropping characters
                        # that cannot be encoded
                        vote_info = eTree.fromstring(
                            line.encode("latin-1", "ignore"))
                except ParseError:
                    print("Parse error occurred " + line)
                    continue

                attrs = vote_info.attrib
                cur_date_obj = parse(attrs["CreationDate"])
                year_month = str(cur_date_obj.year) + "-" + str(
                    cur_date_obj.month)
                last_seen_month = year_month
                # messy way to deal with it, but the dataset has unordered
                # dates, so every line has to be checked
                if cur_date_obj.year != year:
                    continue
                date_list_dict[year_month].append(
                    (attrs["Id"], attrs["PostId"], attrs["VoteTypeId"],
                     attrs["CreationDate"]))

        for date, vote_list in date_list_dict.items():
            tools.save_pickle(
                self.folder_path + "Votes/votes_per_month/" + date,
                vote_list)
        print("Total Number of votes: ", post_count)
Example #5
0
 def create_date_indexes(self):
     """Pickle, for every month, the set of fourth fields of its post records.

     Walks month by month from ``self.start_date_obj`` up to, but not
     including, ``self.end_date_obj``. For each month the records stored in
     ``Posts/posts_per_month/<year>-<month>`` are loaded and the value at
     index 3 of every record (the post ``Id`` in the tuples written by
     ``parse_posts``) is collected into a set, which is saved to
     ``pivot_files/date_to_postid/<year>-<month>``.
     """
     print("In create_date_indexes.")
     month_step = relativedelta(months=1)
     cursor = self.start_date_obj
     while cursor < self.end_date_obj:
         month_key = "-".join((str(cursor.year), str(cursor.month)))
         print(month_key)
         month_posts = tools.load_pickle(
             self.data_path + "Posts/posts_per_month/" + month_key)
         post_ids = {record[3] for record in month_posts}
         tools.save_pickle(
             self.data_path + "pivot_files/date_to_postid/" + month_key,
             post_ids)
         cursor += month_step
     print("create_date_indexes done.")
Example #6
0
    def parse_posts(self):
        """Group questions and answers from ``Posts/Posts.xml`` by month.

        The first eight lines of the file are skipped; every following line
        is parsed as one XML element. Posts with ``PostTypeId`` ``"1"`` are
        stored as ``(PostTypeId, owner, Tags, Id)`` and posts with
        ``PostTypeId`` ``"2"`` as ``(PostTypeId, owner, ParentId, Id)``;
        other post types are ignored. ``owner`` is the ``OwnerUserId``
        attribute, or the integer ``-99`` when that attribute is absent.
        Records are grouped by the ``"<year>-<month>"`` of ``CreationDate``
        and each group is pickled to ``Posts/posts_per_month/<year>-<month>``.

        Reading stops at the first line that raises ``ParseError``.
        """
        posts_by_month = defaultdict(list)
        with open(self.folder_path + "Posts/Posts.xml") as xml_file:
            # Skip the first eight lines of the dump.
            # NOTE(review): the original comment here spoke of "meta info and
            # the first six votes"; presumably copied from parse_votes -
            # confirm what these lines contain for Posts.xml.
            for _ in range(8):
                next(xml_file)

            for line_no, line in enumerate(xml_file):
                if line_no % 10000 == 0:
                    print(line_no)
                try:
                    post_info = eTree.fromstring(line)
                except UnicodeDecodeError:
                    # retry on the latin-1 bytes, dropping characters that
                    # cannot be encoded
                    post_info = eTree.fromstring(
                        line.encode("latin-1", "ignore"))
                except ParseError:
                    print("Parse error occurred " + line)
                    break

                attrs = post_info.attrib
                owner = attrs.get("OwnerUserId", -99)
                created = parse(attrs["CreationDate"])
                month_key = str(created.year) + "-" + str(created.month)

                post_type = attrs["PostTypeId"]
                if post_type == "1":
                    third_field = attrs["Tags"]
                elif post_type == "2":
                    third_field = attrs["ParentId"]
                else:
                    continue
                posts_by_month[month_key].append(
                    (post_type, owner, third_field, attrs["Id"]))

        for month_key, month_posts in posts_by_month.items():
            tools.save_pickle(
                self.folder_path + "Posts/posts_per_month/" + month_key,
                month_posts)
Example #7
0
 def parse_users(self):
     """Read ``Users.xml`` into ``self.users_dict`` and pickle the mapping.

     The first two lines of the file are skipped; every following line is
     parsed as one XML element and stored as
     ``users_dict[Id] = (Reputation, DisplayName)`` using the attribute
     strings. Lines that raise ``ParseError`` are reported and skipped.
     The mapping is saved to ``self.folder_path + "/Users.pickle"``.
     """
     with open(self.folder_path + "Users.xml") as xml_file:
         for _ in range(2):
             next(xml_file)
         for line in xml_file:
             try:
                 user_info = eTree.fromstring(line)
             except UnicodeDecodeError:
                 # retry on the latin-1 bytes, dropping characters that
                 # cannot be encoded
                 user_info = eTree.fromstring(
                     line.encode("latin-1", "ignore"))
             except ParseError:
                 print("Parse error occurred " + line)
                 continue
             attrs = user_info.attrib
             self.users_dict[attrs["Id"]] = (attrs["Reputation"],
                                             attrs["DisplayName"])
     tools.save_pickle(self.folder_path + "/Users.pickle", self.users_dict)
Example #8
0
    def merge_months(self, year, start_month, end_month):
        """Sum the monthly per-user scores of ``year`` into one yearly pickle.

        Loads ``Scores/<year>-<month>`` for every month in
        ``range(start_month, end_month)`` (``end_month`` itself is excluded).
        Each file maps a user id to a record whose item 0 is a total score
        and whose item 1 is a tag-to-score dict. Totals and per-tag scores
        of the same user are added up across months; the result is saved to
        ``year_scores/<year>``.

        Args:
            year: Year used in the input and output file names.
            start_month: First month number to merge.
            end_month: Month number at which merging stops (exclusive).
        """
        merged = {}
        for month in range(start_month, end_month):
            month_path = (self.files_path + "Scores/" + str(year) + "-" +
                          str(month))
            monthly = tools.load_pickle(month_path)
            for user_id, record in monthly.items():
                if user_id not in merged:
                    # first appearance: the loaded record object itself is
                    # kept and later months are accumulated into it
                    merged[user_id] = record
                    continue

                totals = merged[user_id]
                totals[0] += record[0]
                tag_totals = totals[1]
                for tag, tag_score in record[1].items():
                    if tag in tag_totals:
                        tag_totals[tag] += tag_score
                    else:
                        tag_totals[tag] = tag_score

        tools.save_pickle(self.files_path + "year_scores/" + str(year),
                          merged)
Example #9
0
 def reform_votes(self):
     """Attach to every vote the month of the post it was cast on.

     Walks month by month from ``self.start_date_obj`` up to, but not
     including, ``self.end_date_obj``. For each month the votes stored in
     ``Votes/votes_per_month/<year>-<month>`` are loaded. Every vote is
     checked against each entry of ``self.post_date_index``; for each entry
     whose item 1 contains ``vote[1]``, the tuple
     ``(vote[0], vote[1], vote[2], first day of the entry's month)`` is
     collected, where the month is read from the entry's ``"<year>-<month>"``
     item 0. Votes that match no entry are dropped. The collected tuples are
     sorted by that date and saved to
     ``pivot_files/reformed_votes/<year>-<month>``.
     """
     print("In reform votes.")
     one_month = relativedelta(months=1)
     cursor = self.start_date_obj
     while cursor < self.end_date_obj:
         month_key = str(cursor.year) + "-" + str(cursor.month)
         month_votes = tools.load_pickle(
             self.data_path + "Votes/votes_per_month/" + month_key)
         matched = []
         for position, vote in enumerate(month_votes):
             if position % 10000 == 0:
                 # progress as a percentage of this month's votes
                 print((position / len(month_votes) * 100), "%")
             for date_tuple in self.post_date_index:
                 if vote[1] not in date_tuple[1]:
                     continue
                 parts = date_tuple[0].split('-')
                 post_month = datetime(int(parts[0]), int(parts[1]), 1)
                 matched.append((vote[0], vote[1], vote[2], post_month))
         ordered = sorted(matched, key=itemgetter(3))
         tools.save_pickle(
             self.data_path + "pivot_files/reformed_votes/" + month_key,
             ordered)
         cursor += one_month
     print("Reform votes finished.")
Example #10
0
    def working_on_votes(self):
        """Turn each month's votes into per-user total and per-tag scores.

        Walks month by month from ``self.start_date_obj`` up to, but not
        including, ``self.end_date_obj``. For each month the votes in
        ``pivot_files/reformed_votes/<year>-<month>`` are processed in two
        passes:

        1. Votes on questions (post type ``'1'``) with vote type ``'2'`` add
           10 points to the post owner's total and to each tag of the
           question. Votes on answers (post type ``'2'``) are collected
           together with the month of their parent question.
        2. The collected answer votes are sorted by question month; vote type
           ``'1'`` adds 15 points and vote type ``'2'`` adds 10 points to the
           answer owner's total and to each tag of the parent question.

        Scores are accumulated in ``self.date_to_user_score[vote_date]`` as
        ``user -> [total, {tag: score}]`` and the whole structure is pickled
        to ``pivot_files/month_scores`` once all months are done.

        NOTE(review): presumably VoteTypeId '2' is an up-vote and '1' an
        accepted answer - confirm against the data dump schema.
        NOTE(review): ``self.date_to_user_score[vote_date]`` is indexed
        without being created here; it must already exist for every month
        (or the container must create entries on access) - confirm.
        """
        current_date_obj = self.start_date_obj
        month_delta = relativedelta(months=1)
        while current_date_obj < self.end_date_obj:
            # month key without zero padding, e.g. "2008-8"
            vote_date = str(current_date_obj.year) + "-" + str(current_date_obj.month)
            print(vote_date)
            c_month_votes = tools.load_pickle(self.data_path + "pivot_files/reformed_votes/" + vote_date)
            vote_count = 0
            start_time = time.time()
            # The votes are ordered by the month of the post they were cast on
            # (vote[3]), so each reformed_posts file only has to be reloaded
            # when that month changes. The "2008-8" file is loaded up front
            # for every processed month as the starting point.
            post_date = "2008-8"
            date_to_id_to_post = tools.load_pickle(self.data_path + "pivot_files/reformed_posts/" + post_date)

            # Answer votes are collected together with the month of their
            # parent question and processed afterwards with the same
            # load-once-per-month approach used for the questions.
            answer_collection = list()

            for vote in c_month_votes:
                # progress and timing report every 10000 votes
                if vote_count % 10000 == 0:
                    print("Percentage: ", vote_count/len(c_month_votes) * 100, "%")
                    print("Execution time", time.time() - start_time)
                    start_time = time.time()
                vote_count += 1
                # switch to the posts file of the month this vote's post belongs to
                if str(vote[3].year) + "-" + str(vote[3].month) != post_date:
                    post_date = str(vote[3].year) + "-" + str(vote[3].month)
                    date_to_id_to_post = tools.load_pickle(self.data_path + "pivot_files/reformed_posts/" + post_date)
                # vote[1] is the post id; post_tuple is (post type, owner, third field)
                # as stored by reform_post_lists
                post_tuple = date_to_id_to_post[int(vote[1])]
                # if question
                if post_tuple[0] == '1':
                    # owner is -99 when the post had no OwnerUserId (see parse_posts);
                    # the original author treats such posts as deleted
                    if post_tuple[1] != -99:
                        # getting a list of the post's tags
                        tag_list = self.parse_tags(post_tuple[2])
                        if post_tuple[1] in self.date_to_user_score[vote_date]:
                            # known user: add to the total and to every tag
                            if vote[2] == '2':
                                self.date_to_user_score[vote_date][post_tuple[1]][0] += 10
                                for tag in tag_list:
                                    if tag in self.date_to_user_score[vote_date][post_tuple[1]][1]:
                                        self.date_to_user_score[vote_date][post_tuple[1]][1][tag] += 10
                                    else:
                                        self.date_to_user_score[vote_date][post_tuple[1]][1][tag] = 10
                        else:
                            # first score of this user in this month: create the record
                            if vote[2] == '2':
                                temp_tag_dict = dict()
                                for tag in tag_list:
                                    temp_tag_dict[tag] = 10
                                self.date_to_user_score[vote_date][post_tuple[1]] = [10, temp_tag_dict]

                # if answer
                if post_tuple[0] == '2':
                    if post_tuple[1] != -99:
                        # post_tuple[2] is the parent question id for answers;
                        # get_post_date presumably returns "<year>-<month>" or
                        # 'no_post' when the question is unknown - confirm
                        question_date = self.get_post_date(post_tuple[2])
                        if question_date != 'no_post':
                            split_date = question_date.split('-')
                            temp_date_obj = datetime(int(split_date[0]), int(split_date[1]), 1)
                            # (post type, answer owner, question id, vote type, question month)
                            answer_collection.append((post_tuple[0], post_tuple[1], post_tuple[2],
                                                      vote[2], temp_date_obj))

            # second pass: order answer votes by question month so that each
            # reformed_posts file is loaded once per run of equal months
            answer_collection.sort(key=itemgetter(4))
            question_date = "2008-8"
            date_to_id_to_question = tools.load_pickle(self.data_path + "pivot_files/reformed_posts/" + question_date)
            for answer in answer_collection:
                if str(answer[4].year) + "-" + str(answer[4].month) != question_date:
                    question_date = str(answer[4].year) + "-" + str(answer[4].month)
                    date_to_id_to_question = tools.load_pickle(self.data_path + "pivot_files/reformed_posts/"
                                                               + question_date)
                # the answer is scored on the tags of its parent question
                question_tuple = date_to_id_to_question[int(answer[2])]
                tag_list = self.parse_tags(question_tuple[2])
                if answer[1] in self.date_to_user_score[vote_date]:
                    # known user: vote type '1' is worth 15 points, '2' is worth 10
                    if answer[3] == '1':
                        self.date_to_user_score[vote_date][answer[1]][0] += 15
                        for tag in tag_list:
                            if tag in self.date_to_user_score[vote_date][answer[1]][1]:
                                self.date_to_user_score[vote_date][answer[1]][1][tag] += 15
                            else:
                                self.date_to_user_score[vote_date][answer[1]][1][tag] = 15
                    if answer[3] == '2':
                        self.date_to_user_score[vote_date][answer[1]][0] += 10
                        for tag in tag_list:
                            if tag in self.date_to_user_score[vote_date][answer[1]][1]:
                                self.date_to_user_score[vote_date][answer[1]][1][tag] += 10
                            else:
                                self.date_to_user_score[vote_date][answer[1]][1][tag] = 10
                else:
                    # first score of this user in this month: create the record
                    if answer[3] == '1':
                        temp_tag_dict = dict()
                        for tag in tag_list:
                            temp_tag_dict[tag] = 15
                        self.date_to_user_score[vote_date][answer[1]] = [15, temp_tag_dict]
                    if answer[3] == '2':
                        temp_tag_dict = dict()
                        for tag in tag_list:
                            temp_tag_dict[tag] = 10
                        self.date_to_user_score[vote_date][answer[1]] = [10, temp_tag_dict]
            current_date_obj += month_delta
        # all months are written together in a single pickle
        tools.save_pickle(self.data_path + "pivot_files/month_scores", self.date_to_user_score)
Example #11
0
    def active_users(self):
        """Count monthly posts and active users, save yearly users and plot.

        For every month from 2008-08 up to, but not including, 2021-01 this
        loads ``Month_Analysis/Scores/<year>-<month>`` and
        ``Month_Analysis/Posts/<year>-<month>`` and records the number of
        questions (record type ``'1'``), answers (record type ``'2'``),
        distinct values of ``record[1]`` among the posts, and the number of
        entries in the score file. The per-year union of the monthly
        ``record[1]`` sets is pickled to ``year_active_users``, and a
        two-axis chart of the four monthly series is written to
        ``users.png``.
        """
        first_month = parse("2008-08-01T00:00:00.000")
        end_date_obj = parse("2021-01-01T00:00:00.000")
        m_delta = relativedelta(months=1)

        # one datetime per month; used for the x axis and to drive the loop
        month_starts = []
        cursor = first_month
        while cursor < end_date_obj:
            month_starts.append(cursor)
            cursor += m_delta
        m_dates = matplotlib.dates.date2num(month_starts)

        questions_rec, answer_rec = [], []
        post_users_rec, score_users_rec = [], []
        year_users_dict = {}

        for month_start in month_starts:
            year_key = str(month_start.year)
            year_month_str = year_key + "-" + str(month_start.month)
            # the score and tags for each user(uid) for every month
            month_user_scores = tools.load_pickle(
                self.path + "Month_Analysis/Scores/" + year_month_str)
            month_posts = tools.load_pickle(
                self.path + "Month_Analysis/Posts/" + year_month_str)

            m_questions = m_answers = 0
            m_active_users = set()
            for record in month_posts:
                post_type = record[0]
                if post_type == '1':
                    m_questions += 1
                elif post_type == '2':
                    m_answers += 1
                m_active_users.add(record[1])

            questions_rec.append(m_questions)
            answer_rec.append(m_answers)
            post_users_rec.append(len(m_active_users))
            score_users_rec.append(len(month_user_scores))

            if year_key in year_users_dict:
                year_users_dict[year_key] = \
                    year_users_dict[year_key].union(m_active_users)
            else:
                year_users_dict[year_key] = m_active_users

        tools.save_pickle(self.path + "year_active_users", year_users_dict)

        fig, ax1 = plt.subplots()
        left_color = '#000000'
        ax1.set_xlabel('Date')
        ax1.set_ylabel('Number of Posts', color=left_color)
        # The first two entries are drawn with the 'None' format string and
        # carry the colours and labels of the right-axis series.
        # NOTE(review): presumably this is only to make the legend created on
        # ax1 list all four series - confirm.
        left_series = (
            (questions_rec, 'None', "tab:red", "Users made a Post"),
            (questions_rec, 'None', "tab:orange", "Users received Score"),
            (questions_rec, 'b-', "tab:blue", "Questions"),
            (answer_rec, 'b-', "tab:green", "Answers"),
        )
        for values, fmt, line_color, legend_label in left_series:
            ax1.plot_date(m_dates,
                          values,
                          fmt,
                          color=line_color,
                          label=legend_label)
        ax1.tick_params(axis='y', labelcolor=left_color)
        plt.legend()
        plt.gca().set_ylim([0, 330000])

        # second y axis sharing the same x axis
        ax2 = ax1.twinx()
        right_color = 'tab:red'
        # the x label was already set through ax1
        ax2.set_ylabel('Active Users', color=right_color)
        right_series = (
            (post_users_rec, "tab:red", "Users made a Post"),
            (score_users_rec, "tab:orange", "Users received Score"),
        )
        for values, line_color, legend_label in right_series:
            ax2.plot_date(m_dates,
                          values,
                          'b-',
                          color=line_color,
                          label=legend_label)
        plt.gca().set_ylim([0, 440000])
        ax2.tick_params(axis='y', labelcolor=right_color)

        # otherwise the right y-label is slightly clipped
        fig.tight_layout()

        plt.title("Posts and Active Users per Month")
        plt.xticks(rotation=45)

        plt.savefig(self.path + "users.png",
                    bbox_inches='tight',
                    format="png",
                    dpi=300)