Example 1
def questionid_answererid(cate_name):
	questionid_answerid_file = os.path.join(cate_name,"AnswerId_QuestionId.csv")
	q_answer_df = pd.read_csv(questionid_answerid_file)

	answerId_AnswererId_file = os.path.join(cate_name,"AnswerId_AnswererId.csv")
	a_a_df = pd.read_csv(answerId_AnswererId_file)
	a_a_dict = dict(zip(a_a_df["AnswerId"],a_a_df["AnswererId"]))

	ques = []
	answer = []
	answerer = []
	score = []
	creation_date = []

	for q,a,s,c_d in zip(q_answer_df["QuestionId"],q_answer_df["AnswerId"],q_answer_df["Score"],q_answer_df["CreationDate"]):
		if a in a_a_dict:
			ques.append(q)
			answer.append(a)
			answerer.append(a_a_dict[a])
			score.append(s)
			creation_date.append(c_d)

	df = pd.DataFrame({"QuestionId":ques,"AnswerId":answer,"AnswererId":answerer,"Score":score,"CreationDate":creation_date})
	df.to_csv(os.path.join(cate_name,"QuestionId_AnswererId.csv"),index = True, columns = ["QuestionId","AnswerId","AnswererId","Score","CreationDate"])
	sprint(cate_name,"pystack_analysis.log","# question-answer-answerer pairs: %d" % len(df))
Example 2
def askerid_answererid(cate_name):
	questionid_askerid_file = os.path.join(cate_name,"QuestionId_AskerId.csv")
	q_asker_df = pd.read_csv(questionid_askerid_file)
	q_asker_dict = {}

	for q,a,q_c_d in zip(q_asker_df["QuestionId"],q_asker_df["AskerId"],q_asker_df["CreationDate"]):
		q_asker_dict[q] = [a,q_c_d]

	q_AnswererId_file = os.path.join(cate_name,"QuestionId_AnswererId.csv")
	q_answerer_df = pd.read_csv(q_AnswererId_file)

	ques = []
	asker = []
	answerer = []
	score = []
	q_creation_date = []
	answer_creation_date = []

	for q,a,s,c_d in zip(q_answerer_df["QuestionId"],q_answerer_df["AnswererId"],q_answerer_df["Score"],q_answerer_df["CreationDate"]):
		if q in q_asker_dict:
			ques.append(q)
			asker.append(q_asker_dict[q][0])
			answerer.append(a)
			score.append(s)
			q_creation_date.append(q_asker_dict[q][1])
			answer_creation_date.append(c_d)

	df = pd.DataFrame({"QuestionId":ques,"AskerId":asker,"AnswererId":answerer,"Score":score,"QuestionCreationDate":q_creation_date,"AnswerCreationDate":answer_creation_date})
	df.to_csv(os.path.join(cate_name,"AskerId_AnswererId.csv"),index = True, columns = ["QuestionId","AskerId","AnswererId","Score","QuestionCreationDate","AnswerCreationDate"])
	sprint(cate_name,"pystack_analysis.log","# question-asker-answerer pairs: %d" % len(df))
Example 3
def process_answer_body(dir_path,Answers):
	answers_file = os.path.join(dir_path,"Answers.pkl")
	answers_body = dict(zip(Answers["AnswerId"],Answers["Body"]))

	sprint(dir_path,"pystack_analysis.log","# answers with body: %d" % len(answers_body))
	with open(answers_file,"wb") as f:
		pickle.dump(answers_body,f)
Example 4
def duplicte_questions(cate_name):
    df = pd.read_csv(os.path.join(cate_name, "PostId_RelatedPostId.csv"))
    result = df.groupby("LinkTypeId")
    q1 = []
    q2 = []
    for k, v in result:
        d = pd.DataFrame({
            "QuestionId": v["PostId"],
            "RelatedQuestionId": v["RelatedPostId"]
        })
        if k == 3:  # LinkTypeId 3 marks duplicate links
            print("Processing duplicate questions...")
            sprint(cate_name, "pystack_analysis.log",
                   "# duplicate questions: %d" % len(d))
            file_name = "Duplicate_Questions.csv"
        elif k == 1:  # LinkTypeId 1 marks related ("linked") questions
            print("Processing related questions...")
            sprint(cate_name, "pystack_analysis.log",
                   "# related questions: %d" % len(d))
            file_name = "Related_Questions_Source2Target.csv"
        else:
            # Skip other link types; without this, file_name would be
            # undefined (or stale from a previous group) in to_csv below.
            continue

        d.to_csv(os.path.join(cate_name, file_name),
                 index=False,
                 columns=["QuestionId", "RelatedQuestionId"])
        print("file saved to: %s" % file_name)
    print("***********************************")
Example 5
def process_question_text(dir_path,Questions):
	questions_file = os.path.join(dir_path,"Questions.pkl")
	questions_dict = {}
	for question,title,body in zip(Questions["QuestionId"],Questions["Title"],Questions["Body"]):
		questions_dict[question] = [title,body]

	sprint(dir_path,"pystack_analysis.log","# questions with title and body: %d" % len(questions_dict))

	with open(questions_file,"wb") as f:
		pickle.dump(questions_dict,f)
Example 6
def process_QuestionId_AcceptedAnswerId(output_dir,Questions):
	QuestionId = []
	AcceptedAnswerId = []
	for qid,aid in zip(Questions["QuestionId"],Questions["AcceptedAnswerId"]):
		if aid:  # keep only questions that actually have an accepted answer
			QuestionId.append(qid)
			AcceptedAnswerId.append(aid)
	df = pd.DataFrame({"QuestionId":QuestionId,"AcceptedAnswerId":AcceptedAnswerId})
	output_file = os.path.join(output_dir,"QuestionId_AcceptedAnswerId.csv")
	df.to_csv(output_file,index = True, columns = ["QuestionId","AcceptedAnswerId"])
	sprint(output_dir,"pystack_analysis.log","# question-acceptedAnswer pairs: %d" % len(df))
Example 7
def comments_processing(input_file):
    d = {
        "PostId": [],
        "UserId": [],
        "Score": [],
        "Text": [],
        "CreationDate": []
    }
    for event, elem in ET.iterparse(input_file):
        if event == "end":
            try:
                postid = int(elem.attrib["PostId"])
                userid = int(elem.attrib["UserId"])
                score = int(elem.attrib["Score"])
                creationdate = elem.attrib["CreationDate"]
                text = elem.attrib["Text"]

                d["PostId"].append(postid)
                d["UserId"].append(userid)
                d["Score"].append(score)
                d["CreationDate"].append(creationdate)
                d["Text"].append(text)
                # Release the parsed element so memory stays flat while streaming the XML.
                elem.clear()
            except Exception:
                # Skip rows missing a required attribute (e.g. comments
                # whose author account was deleted and has no UserId).
                pass
    assert len(d["PostId"]) == len(d["UserId"]) and len(d["UserId"]) == len(
        d["Score"]) and len(d["Score"]) == len(d["CreationDate"]) and len(
            d["Score"]) == len(d["Text"])

    file_dir = os.path.dirname(os.path.abspath(input_file))
    postid_userid_file = os.path.join(file_dir, "PostId_CommenterId.csv")
    comments_file = os.path.join(file_dir, "PostId_CommenterId_Text.pkl")

    df1 = pd.DataFrame(d)
    df1.to_csv(postid_userid_file,
               index=False,
               columns=["PostId", "UserId", "Score"])
    print("output file: %s" % postid_userid_file)
    sprint(file_dir, "pystack_analysis.log",
           "# PostId-Comment pairs: %d" % len(df1))
    print("output file: %s" % comments_file)
    with open(comments_file, "wb") as f:
        pickle.dump(d, f)
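comments_processing and badges_processing both rely on the streaming ET.iterparse + elem.clear() pattern, which keeps memory roughly constant on large Stack Exchange XML dumps. A self-contained sketch of that pattern on an in-memory document (the XML below is a made-up miniature in the shape of a Comments dump):

import io
import xml.etree.ElementTree as ET

xml_bytes = b"""<comments>
  <row Id="1" PostId="10" UserId="7" Score="2" Text="nice" CreationDate="2019-01-01T00:00:00" />
  <row Id="2" PostId="11" Score="0" Text="anon" CreationDate="2019-01-02T00:00:00" />
</comments>"""

rows = []
for event, elem in ET.iterparse(io.BytesIO(xml_bytes)):
    if event == "end" and elem.tag == "row":
        try:
            rows.append((int(elem.attrib["PostId"]), int(elem.attrib["UserId"])))
        except KeyError:
            pass  # the second row has no UserId, like a comment from a deleted account
        elem.clear()  # drop the element's text/children so memory does not grow
print(rows)  # [(10, 7)]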
Example 8
def questionid_bestanswererid(cate_name):
	questionid_bestanswerid_file = os.path.join(cate_name,"QuestionId_AcceptedAnswerId.csv")
	q_b_df = pd.read_csv(questionid_bestanswerid_file)
	answerId_AnswererId_file = os.path.join(cate_name,"AnswerId_AnswererId.csv")
	a_a_df = pd.read_csv(answerId_AnswererId_file)
	q_b_dict = dict(zip(q_b_df["QuestionId"],q_b_df["AcceptedAnswerId"]))
	a_a_dict = dict(zip(a_a_df["AnswerId"],a_a_df["AnswererId"]))
	q_l = []
	a_l = []
	aer_l = []
	for q,b in q_b_dict.items():
		if b in a_a_dict:
			q_l.append(q)
			a_l.append(b)
			aer_l.append(a_a_dict[b]) 

	q_ber_df = pd.DataFrame({"QuestionId":q_l,"AcceptedAnswerId":a_l,"AcceptedAnswererId":aer_l})
	q_ber_df.to_csv(os.path.join(cate_name,"QuestionId_AcceptedAnswererId.csv"),index = True, columns = ["QuestionId","AcceptedAnswerId","AcceptedAnswererId"])
	sprint(cate_name,"pystack_analysis.log","# question-bestAnswerer pairs: %d" % len(q_ber_df))
Example 9
def badges_processing(file_name):
    index = []
    UserId = []
    BadgeName = []
    BadgeDate = []

    for event, elem in ET.iterparse(file_name):
        if event == "end":
            try:
                ind = int(elem.attrib["Id"])  # badge row id (collected but not written to the CSV)
                userid = int(elem.attrib["UserId"])
                badgename = elem.attrib["Name"]
                badgedate = elem.attrib["Date"]
                index.append(ind)
                UserId.append(userid)
                BadgeName.append(badgename)
                BadgeDate.append(badgedate)
                elem.clear()
            except Exception:
                # Skip rows missing a required attribute.
                pass

    dir_path = os.path.dirname(os.path.abspath(file_name))
    output_file = os.path.join(dir_path, "Badges.csv")
    print("output file: %s" % output_file)
    df = pd.DataFrame({
        "UserId": UserId,
        "BadgeName": BadgeName,
        "BadgeDate": BadgeDate
    })
    df.to_csv(output_file,
              index=False,
              columns=["UserId", "BadgeName", "BadgeDate"])
    sprint(dir_path, "pystack_analysis.log",
           "# users having badges: %d" % len(df))
Example 10
def process_question_tags(output_dir,Questions):
	df = pd.DataFrame({"QuestionId":Questions["QuestionId"],"Tags":Questions["Tags"]})
	tags_set = []
	question_tags = {}
	for q,t in zip(df["QuestionId"],df["Tags"]):
		tags = [tag[1:] for tag in t.split(">") if len(tag) > 0]  # Tags string looks like "<a><b>"; strip the leading "<"
		tags_set += tags
		question_tags[q] = tags
	
	sprint(output_dir,"pystack_analysis.log","# question with tags: %d" % len(question_tags))
	sprint(output_dir,"pystack_analysis.log","# tags per question: %0.4f" % (sum([len(x) for _,x in question_tags.iteritems()])*1.0/len(question_tags)))
	sprint(output_dir,"pystack_analysis.log","# unique tags: %d" % len(set(tags_set)))
	
	with open(os.path.join(output_dir,"question_tags.pkl"),"wb") as f:
		pickle.dump(question_tags,f)
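The Tags value parsed above is a single string of the form "<tag1><tag2>..." (inferred from the split-and-strip logic); the list comprehension splits on ">" and drops the leading "<" of each piece. A quick self-contained check:

t = "<python><pandas><xml>"  # example Tags value (made up)
tags = [tag[1:] for tag in t.split(">") if len(tag) > 0]
print(tags)  # ['python', 'pandas', 'xml']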
Example 11
def process_AnswerId_AnswererId(output_dir,Answers):

	output_file = os.path.join(output_dir,"AnswerId_AnswererId.csv")
	df = pd.DataFrame({"AnswerId":Answers["AnswerId"],"AnswererId":Answers["OwnerUserId"]})
	df.to_csv(output_file,index = True, columns = ["AnswerId","AnswererId"])
	sprint(output_dir,"pystack_analysis.log","# answer-answerer pairs: %d" % len(df))
Example 12
def process_AnswerId_QuestionId(output_dir,Answers):

	output_file = os.path.join(output_dir,"AnswerId_QuestionId.csv")
	df = pd.DataFrame({"QuestionId":Answers["QuestionId"],"AnswerId":Answers["AnswerId"],"Score":Answers["Score"],"CreationDate":Answers["CreationDate"]})
	df.to_csv(output_file,index = True, columns = ["QuestionId","AnswerId","Score","CreationDate"])
	sprint(output_dir,"pystack_analysis.log","# question-answer pairs: %d" % len(df))
Example 13
def process_QuestionId_AskerId(output_dir,Questions):
	output_file = os.path.join(output_dir,"QuestionId_AskerId.csv")
	df = pd.DataFrame({"QuestionId":Questions["QuestionId"],"AskerId":Questions["OwnerUserId"],"CreationDate":Questions["CreationDate"]})
	df.to_csv(output_file,index = True, columns = ["QuestionId","AskerId","CreationDate"])
	sprint(output_dir,"pystack_analysis.log","# question-asker pairs: %d" % len(df))