def create_relations(self, year):
    """Write the pairwise user-relation edge list of ``year`` to a CSV file.

    Steps, in order:

    1. Open ``Graphs/Relations/<year>_users_relations.csv`` (below
       ``self.files_path``) for writing and emit the ``Source,Target,Weight``
       header.
    2. Load the per-user records from ``year_scores/<year>``. Each record is
       indexed as ``record[0]`` (copied through unchanged) and ``record[1]``
       (a dict keyed by tag).
    3. Build, per user, ``[record[0], normalised tag dict, set of tag keys]``
       where the normalised dict is ``self.normalize_tag_score`` applied to a
       deep copy of ``record[1]``.
    4. For every unordered pair of users write one row whose weight is
       ``2 - self.users_distance2(first, second)``.
    5. Pickle the normalised records to ``year_scores/normalized_<year>``.

    Args:
        year: Year whose scores are processed; it is converted with ``str``
            when building the file names.
    """
    start_time = time.time()
    year_text = str(year)
    csv_path = (self.files_path + "Graphs/Relations/" + year_text
                + "_users_relations.csv")
    with open(csv_path, "w") as rel_file:
        rel_file.write("Source,Target,Weight\n")
        user_tag_dict = tools.load_pickle(
            self.files_path + "year_scores/" + year_text)
        normalized_user_tag_dict = {
            user_id: [
                tag_list[0],
                # deep copy so the loaded record is not touched by the
                # normalisation helper
                self.normalize_tag_score(copy.deepcopy(tag_list[1])),
                set(tag_list[1].keys()),
            ]
            for user_id, tag_list in user_tag_dict.items()
        }
        all_users_ids = list(normalized_user_tag_dict)
        print(len(all_users_ids))
        # Pair every user with each user that follows it, so each unordered
        # pair is written exactly once.
        for idx, outer_user_id in enumerate(all_users_ids[:-1]):
            print(idx)
            outer_record = normalized_user_tag_dict[outer_user_id]
            for inner_user_id in all_users_ids[idx + 1:]:
                usr_distance2 = self.users_distance2(
                    outer_record, normalized_user_tag_dict[inner_user_id])
                rel_file.write(",".join((str(outer_user_id),
                                         str(inner_user_id),
                                         str(2 - usr_distance2))) + "\n")
    # The file name used to be hard-coded to "normalized_2020", so running
    # any other year silently overwrote the 2020 results.
    tools.save_pickle(
        self.files_path + "year_scores/normalized_" + year_text,
        normalized_user_tag_dict)
    print("execution time", time.time() - start_time)
def reform_post_lists(self):
    """Re-key every monthly post pickle by integer post id.

    Each file matched by ``Posts/posts_per_month/*`` (below
    ``self.data_path``) is loaded as an iterable of post records. A record
    ``r`` becomes the entry ``int(r[3]) -> (r[0], r[1], r[2])`` and the
    resulting dict is pickled under the same file name in
    ``pivot_files/reformed_posts/``.
    """
    print("In reform post lists.")
    source_pattern = self.data_path + "Posts/posts_per_month/*"
    target_dir = self.data_path + "pivot_files/reformed_posts/"
    for month_path in glob.iglob(source_pattern):
        # Everything after the last "/" is the month's file name.
        month_name = month_path.rpartition("/")[2]
        print(month_name)
        posts_by_id = {
            int(record[3]): (record[0], record[1], record[2])
            for record in tools.load_pickle(month_path)
        }
        tools.save_pickle(target_dir + month_name, posts_by_id)
    print("reform post lists done.")
def process_scores(self):
    """Run ``self.slot_score`` over every monthly score pickle.

    Months are visited from ``self.start_date_obj`` up to, but not
    including, ``self.end_date_obj``. For each month the pickle
    ``<score_path><year>-<month>`` is loaded, passed to ``self.slot_score``
    and the result stored in ``self.month_tag_scores`` under the
    ``"<year>-<month>"`` key. Once all months are done the whole mapping is
    pickled to ``<score_path>month_tag_scores``.
    """
    one_month = relativedelta(months=1)
    cursor = self.start_date_obj
    while cursor < self.end_date_obj:
        month_key = "{}-{}".format(cursor.year, cursor.month)
        print(month_key)
        raw_scores = tools.load_pickle(self.score_path + month_key)
        self.month_tag_scores[month_key] = self.slot_score(raw_scores)
        cursor += one_month
    tools.save_pickle(self.score_path + "month_tag_scores",
                      self.month_tag_scores)
def parse_votes(self, year):
    """Split the votes created in ``year`` into one pickle per month.

    ``Votes/Votes.xml`` (below ``self.folder_path``) is read line by line and
    every line is parsed as a stand-alone XML element. Votes whose
    ``CreationDate`` falls in ``year`` are grouped under a
    ``"<year>-<month>"`` key as tuples of the attribute strings
    ``(Id, PostId, VoteTypeId, CreationDate)``. Each group is pickled to
    ``Votes/votes_per_month/<year>-<month>``.

    Lines that are not well-formed XML are printed and skipped. Progress
    (lines read and the most recent month seen) is printed every million
    lines, and the number of lines read is printed at the end.

    Args:
        year: Compared with ``==`` semantics against the integer year of
            each vote's creation date, so pass an ``int``.
    """

    def parse_row(raw_line):
        """Return the element parsed from ``raw_line`` or None if malformed."""
        try:
            return eTree.fromstring(raw_line)
        except UnicodeDecodeError:
            # Fall through to the latin-1 retry below.
            pass
        except ParseError:
            print("Parse error occurred " + raw_line)
            return None
        try:
            return eTree.fromstring(raw_line.encode("latin-1", "ignore"))
        except ParseError:
            # This failure used to propagate out of the except-handler and
            # abort the whole run; report it like any other bad line.
            print("Parse error occurred " + raw_line)
            return None

    date_list_dict = defaultdict(list)
    with open(self.folder_path + "Votes/Votes.xml") as xml_file:
        # Skip the meta lines and the first six votes: those votes are on
        # the last day of July and we want one month windows.
        for _ in range(8):
            next(xml_file)
        post_count = 0
        false_date = 0  # last "<year>-<month>" seen; only used for progress
        for line in xml_file:
            if post_count % 1000000 == 0:
                print(post_count)
                print(false_date)
            post_count += 1
            vote_info = parse_row(line)
            if vote_info is None:
                continue
            attrs = vote_info.attrib
            cur_date_obj = parse(attrs["CreationDate"])
            year_month = "{}-{}".format(cur_date_obj.year,
                                        cur_date_obj.month)
            false_date = year_month
            # Messy, but the dataset has unordered dates, so every row has
            # to be checked instead of stopping at the first later year.
            if cur_date_obj.year != year:
                continue
            date_list_dict[year_month].append(
                (attrs["Id"], attrs["PostId"], attrs["VoteTypeId"],
                 attrs["CreationDate"]))
    for date, vote_list in date_list_dict.items():
        tools.save_pickle(
            self.folder_path + "Votes/votes_per_month/" + date, vote_list)
    print("Total Number of votes: ", post_count)
def create_date_indexes(self):
    """Build, per month, the set of post ids created in that month.

    For each month from ``self.start_date_obj`` up to, but not including,
    ``self.end_date_obj`` the records in
    ``Posts/posts_per_month/<year>-<month>`` are loaded and the fourth field
    of every record (``record[3]``) is collected into a set, which is pickled
    to ``pivot_files/date_to_postid/<year>-<month>``. The ids are stored
    exactly as found in the records (no type conversion).
    """
    print("In create_date_indexes.")
    one_month = relativedelta(months=1)
    cursor = self.start_date_obj
    while cursor < self.end_date_obj:
        month_key = "{}-{}".format(cursor.year, cursor.month)
        print(month_key)
        month_posts = tools.load_pickle(
            self.data_path + "Posts/posts_per_month/" + month_key)
        post_ids = {record[3] for record in month_posts}
        tools.save_pickle(
            self.data_path + "pivot_files/date_to_postid/" + month_key,
            post_ids)
        cursor += one_month
    print("create_date_indexes done.")
def parse_posts(self):
    """Group questions and answers from ``Posts/Posts.xml`` by month.

    The file (below ``self.folder_path``) is read line by line and each line
    is parsed as a stand-alone XML element. Rows are grouped under the
    ``"<year>-<month>"`` of their ``CreationDate``:

    * ``PostTypeId == "1"`` -> ``(PostTypeId, owner, Tags, Id)``
    * ``PostTypeId == "2"`` -> ``(PostTypeId, owner, ParentId, Id)``
    * any other type is ignored.

    ``owner`` is the ``OwnerUserId`` attribute, or the integer ``-99`` when
    the row has none. Reading stops at the first line that is not
    well-formed XML. Every group is pickled to
    ``Posts/posts_per_month/<year>-<month>``.
    """
    with open(self.folder_path + "Posts/Posts.xml") as xml_file:
        # Skip eight lines: the meta lines plus the first six rows.
        # NOTE(review): the original comment (copied from the votes parser)
        # says those rows are on the last day of July and one month windows
        # are wanted -- confirm this also holds for the posts file.
        for _ in range(8):
            next(xml_file)
        post_date_dict = defaultdict(list)
        for seen, line in enumerate(xml_file):
            if seen % 10000 == 0:
                print(seen)
            try:
                post_info = eTree.fromstring(line)
            except UnicodeDecodeError:
                post_info = eTree.fromstring(line.encode("latin-1", "ignore"))
            except ParseError:
                print("Parse error occurred " + line)
                break
            attrs = post_info.attrib
            owner = attrs["OwnerUserId"] if "OwnerUserId" in attrs else -99
            post_date_obj = parse(attrs["CreationDate"])
            year_month = "{}-{}".format(post_date_obj.year,
                                        post_date_obj.month)
            post_type = attrs["PostTypeId"]
            if post_type == "1":
                post_date_dict[year_month].append(
                    (post_type, owner, attrs["Tags"], attrs["Id"]))
            elif post_type == "2":
                post_date_dict[year_month].append(
                    (post_type, owner, attrs["ParentId"], attrs["Id"]))
    for month_key, month_posts in post_date_dict.items():
        tools.save_pickle(
            self.folder_path + "Posts/posts_per_month/" + month_key,
            month_posts)
def parse_users(self):
    """Load ``Users.xml`` into ``self.users_dict`` and pickle the result.

    The first two lines of the file are skipped. Every following line is
    parsed as a stand-alone XML element and stored as
    ``users_dict[Id] = (Reputation, DisplayName)``, all three taken verbatim
    from the element's attributes. Lines that are not well-formed XML are
    printed and skipped. The filled dict is pickled to
    ``<folder_path>/Users.pickle``.
    """
    with open(self.folder_path + "Users.xml") as xml_file:
        for _ in range(2):
            next(xml_file)
        for line in xml_file:
            try:
                user_info = eTree.fromstring(line)
            except UnicodeDecodeError:
                # Retry with the line re-encoded as latin-1, dropping
                # characters that cannot be represented.
                user_info = eTree.fromstring(line.encode("latin-1", "ignore"))
            except ParseError:
                print("Parse error occurred " + line)
                continue
            attrs = user_info.attrib
            self.users_dict[attrs["Id"]] = (attrs["Reputation"],
                                            attrs["DisplayName"])
    tools.save_pickle(self.folder_path + "/Users.pickle", self.users_dict)
def merge_months(self, year, start_month, end_month):
    """Sum monthly user scores into one yearly record per user.

    Loads ``Scores/<year>-<month>`` (below ``self.files_path``) for every
    month in ``range(start_month, end_month)`` -- ``end_month`` itself is not
    included. Each monthly record is indexed as ``record[0]`` (a total) and
    ``record[1]`` (a dict of per-tag scores). The first record seen for a
    user is adopted as-is; later records are added into it, total to total
    and tag by tag. The merged mapping is pickled to ``year_scores/<year>``.

    Args:
        year: Year used to build the input and output file names.
        start_month: First month number to load.
        end_month: Month number at which to stop (exclusive).
    """
    year_dict = dict()
    for month in range(start_month, end_month):
        month_scores = tools.load_pickle(
            self.files_path + "Scores/" + str(year) + "-" + str(month))
        for user_id, tag_list in month_scores.items():
            if user_id not in year_dict:
                year_dict[user_id] = tag_list
                continue
            merged = year_dict[user_id]
            merged[0] += tag_list[0]
            merged_tags = merged[1]
            for tag, tag_score in tag_list[1].items():
                if tag in merged_tags:
                    merged_tags[tag] += tag_score
                else:
                    merged_tags[tag] = tag_score
    tools.save_pickle(self.files_path + "year_scores/" + str(year),
                      year_dict)
def reform_votes(self):
    """Tag every vote with the month of the post it refers to.

    For each month from ``self.start_date_obj`` up to, but not including,
    ``self.end_date_obj`` the votes in ``Votes/votes_per_month/<year>-<month>``
    are loaded. Every vote is checked against each entry of
    ``self.post_date_index``; an entry is indexed as ``entry[0]`` (a
    ``"<year>-<month>"`` string) and ``entry[1]`` (a container tested with
    ``in`` for the vote's ``vote[1]``). For every entry that contains it,
    ``(vote[0], vote[1], vote[2], first day of that month)`` is recorded.
    Votes matching no entry are dropped. The recorded tuples are sorted by
    that date and pickled to ``pivot_files/reformed_votes/<year>-<month>``.
    """
    print("In reform votes.")
    one_month = relativedelta(months=1)
    cursor = self.start_date_obj
    while cursor < self.end_date_obj:
        vote_date = "{}-{}".format(cursor.year, cursor.month)
        month_votes = tools.load_pickle(
            self.data_path + "Votes/votes_per_month/" + vote_date)
        dated_votes = []
        for seen, vote in enumerate(month_votes):
            if seen % 10000 == 0:
                print((seen / len(month_votes) * 100), "%")
            for index_entry in self.post_date_index:
                if vote[1] not in index_entry[1]:
                    continue
                date_parts = index_entry[0].split('-')
                month_start = datetime(int(date_parts[0]),
                                       int(date_parts[1]), 1)
                dated_votes.append((vote[0], vote[1], vote[2], month_start))
        dated_votes.sort(key=itemgetter(3))
        tools.save_pickle(
            self.data_path + "pivot_files/reformed_votes/" + vote_date,
            dated_votes)
        cursor += one_month
    print("Reform votes finished.")
def working_on_votes(self):
    """Turn each month's votes into per-user total and per-tag scores.

    For every month from ``self.start_date_obj`` up to, but not including,
    ``self.end_date_obj`` the votes in
    ``pivot_files/reformed_votes/<year>-<month>`` are processed and the
    results accumulated in ``self.date_to_user_score[<year>-<month>]`` as
    ``user -> [total, {tag: score}]``. Posts whose owner is ``-99`` are
    ignored.

    * Vote on a question (post type ``'1'``): vote type ``'2'`` gives the
      owner 10 points, in total and on each of the question's tags.
    * Vote on an answer (post type ``'2'``): the answer is first collected
      together with the month of its parent question (answers for which
      ``self.get_post_date`` returns ``'no_post'`` are dropped). The
      collected answers are then sorted by that month and scored with the
      parent question's tags: vote type ``'1'`` gives 15 points, vote type
      ``'2'`` gives 10.
      NOTE(review): presumably types 1/2 are "accepted"/"up-vote" in the
      Stack Exchange schema -- confirm.

    After all months the whole ``self.date_to_user_score`` mapping is pickled
    to ``pivot_files/month_scores``.
    """
    question_points = {'2': 10}
    answer_points = {'1': 15, '2': 10}

    def credit(month_key, user_id, tags, vote_type, points_table):
        """Add the points earned by ``vote_type`` to ``user_id``'s record."""
        month_scores = self.date_to_user_score[month_key]
        points = points_table.get(vote_type)
        if points is None:
            return
        if user_id in month_scores:
            record = month_scores[user_id]
            record[0] += points
            tag_scores = record[1]
            for tag in tags:
                tag_scores[tag] = tag_scores.get(tag, 0) + points
        else:
            # First scoring vote for this user in this month.
            month_scores[user_id] = [points, dict.fromkeys(tags, points)]

    one_month = relativedelta(months=1)
    posts_dir = self.data_path + "pivot_files/reformed_posts/"
    cursor = self.start_date_obj
    while cursor < self.end_date_obj:
        vote_date = "{}-{}".format(cursor.year, cursor.month)
        print(vote_date)
        month_votes = tools.load_pickle(
            self.data_path + "pivot_files/reformed_votes/" + vote_date)
        votes_seen = 0
        start_time = time.time()

        # The votes are ordered by the date of the post they were placed on,
        # so each reformed_posts file only has to be loaded once per date.
        post_date = "2008-8"
        date_to_id_to_post = tools.load_pickle(posts_dir + post_date)

        # Answers are collected together with the date of their question and
        # processed afterwards with the same ordered-by-date approach.
        answer_collection = []
        for vote in month_votes:
            if votes_seen % 10000 == 0:
                print("Percentage: ", votes_seen/len(month_votes) * 100, "%")
                print("Execution time", time.time() - start_time)
                start_time = time.time()
            votes_seen += 1

            vote_post_date = "{}-{}".format(vote[3].year, vote[3].month)
            if vote_post_date != post_date:
                post_date = vote_post_date
                date_to_id_to_post = tools.load_pickle(posts_dir + post_date)

            post_tuple = date_to_id_to_post[int(vote[1])]
            post_kind = post_tuple[0]
            owner = post_tuple[1]
            if owner == -99:
                # the post has no owner recorded (deleted)
                continue

            if post_kind == '1':
                # question: score it with its own tags
                tag_list = self.parse_tags(post_tuple[2])
                credit(vote_date, owner, tag_list, vote[2], question_points)
            elif post_kind == '2':
                # answer: remember it with the month of its question
                parent_date = self.get_post_date(post_tuple[2])
                if parent_date != 'no_post':
                    date_parts = parent_date.split('-')
                    parent_month = datetime(int(date_parts[0]),
                                            int(date_parts[1]), 1)
                    answer_collection.append(
                        (post_kind, owner, post_tuple[2], vote[2],
                         parent_month))

        answer_collection.sort(key=itemgetter(4))
        question_date = "2008-8"
        date_to_id_to_question = tools.load_pickle(posts_dir + question_date)
        for answer in answer_collection:
            answer_month = "{}-{}".format(answer[4].year, answer[4].month)
            if answer_month != question_date:
                question_date = answer_month
                date_to_id_to_question = tools.load_pickle(
                    posts_dir + question_date)
            question_tuple = date_to_id_to_question[int(answer[2])]
            tag_list = self.parse_tags(question_tuple[2])
            credit(vote_date, answer[1], tag_list, answer[3], answer_points)

        cursor += one_month
    tools.save_pickle(self.data_path + "pivot_files/month_scores",
                      self.date_to_user_score)
def active_users(self):
    """Plot monthly post counts and active-user counts; save yearly user sets.

    Covers the months from 2008-08 up to, but not including, 2021-01. For
    each month the pickles ``Month_Analysis/Scores/<year>-<month>`` and
    ``Month_Analysis/Posts/<year>-<month>`` (below ``self.path``) are loaded
    and four numbers are recorded:

    * posts whose ``record[0]`` is ``'1'`` (plotted as "Questions"),
    * posts whose ``record[0]`` is ``'2'`` (plotted as "Answers"),
    * distinct ``record[1]`` values over all posts ("Users made a Post"),
    * ``len()`` of the month's score mapping ("Users received Score").

    The per-month user sets are also unioned per calendar year and pickled to
    ``<path>year_active_users``. The figure (post counts on the left axis,
    user counts on a twin right axis) is written to ``<path>users.png``.
    """
    first_month = parse("2008-08-01T00:00:00.000")
    end_date_obj = parse("2021-01-01T00:00:00.000")
    one_month = relativedelta(months=1)

    questions_rec = []
    answer_rec = []
    post_users_rec = []
    score_users_rec = []
    year_users_dict = {}

    # creating the dates for the plot
    month_starts = []
    cursor = first_month
    while cursor < end_date_obj:
        month_starts.append(cursor)
        cursor += one_month
    m_dates = matplotlib.dates.date2num(month_starts)

    for month_start in month_starts:
        month_key = "{}-{}".format(month_start.year, month_start.month)
        # the score and tags for each user(uid) for every month
        month_user_scores = tools.load_pickle(
            self.path + "Month_Analysis/Scores/" + month_key)
        month_posts = tools.load_pickle(
            self.path + "Month_Analysis/Posts/" + month_key)

        question_total = 0
        answer_total = 0
        month_users = set()
        for record in month_posts:
            post_kind = record[0]
            if post_kind == '1':
                question_total += 1
            elif post_kind == '2':
                answer_total += 1
            month_users.add(record[1])

        questions_rec.append(question_total)
        answer_rec.append(answer_total)
        post_users_rec.append(len(month_users))
        score_users_rec.append(len(month_user_scores))

        year_key = str(month_start.year)
        users_so_far = year_users_dict.get(year_key)
        if users_so_far is None:
            year_users_dict[year_key] = month_users
        else:
            year_users_dict[year_key] = users_so_far.union(month_users)

    tools.save_pickle(self.path + "year_active_users", year_users_dict)

    # NOTE: the pyplot calls below act on the *current* axes, so their order
    # relative to ax1.twinx() must not be changed.
    fig, ax1 = plt.subplots()
    left_color = '#000000'
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Number of Posts', color=left_color)
    # NOTE(review): the two 'None'-format series look like placeholders so
    # that the legend built on this axis also lists the user series that are
    # drawn on the twin axis -- confirm.
    ax1.plot_date(m_dates, questions_rec, 'None', color="tab:red",
                  label="Users made a Post")
    ax1.plot_date(m_dates, questions_rec, 'None', color="tab:orange",
                  label="Users received Score")
    ax1.plot_date(m_dates, questions_rec, 'b-', color="tab:blue",
                  label="Questions")
    ax1.plot_date(m_dates, answer_rec, 'b-', color="tab:green",
                  label="Answers")
    ax1.tick_params(axis='y', labelcolor=left_color)
    plt.legend()
    plt.gca().set_ylim([0, 330000])

    # instantiate a second axes that shares the same x-axis
    ax2 = ax1.twinx()
    right_color = 'tab:red'
    # we already handled the x-label with ax1
    ax2.set_ylabel('Active Users', color=right_color)
    ax2.plot_date(m_dates, post_users_rec, 'b-', color="tab:red",
                  label="Users made a Post")
    ax2.plot_date(m_dates, score_users_rec, 'b-', color="tab:orange",
                  label="Users received Score")
    plt.gca().set_ylim([0, 440000])
    ax2.tick_params(axis='y', labelcolor=right_color)

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.title("Posts and Active Users per Month")
    plt.xticks(rotation=45)
    plt.savefig(self.path + "users.png", bbox_inches='tight', format="png",
                dpi=300)