s = str(elem) except UnicodeEncodeError: s = s.encode("utf-8") except: return "" return s.replace("\t", " ").replace("\n", " ") dirname = argv[1] def write_tsv_row(f, l): stringified = [clean_elem(x) for x in l] f.write("\t".join(stringified) + "\n") user_to_answers = {} with open(dirname + '/Answers.xml') as qfile: for row, line in RowIter(qfile): if 'OwnerUserId' in row: user_id = row['OwnerUserId'] if not user_id in user_to_answers: user_to_answers[user_id] = [] user_to_answers[user_id].append(row['Id']) with open(dirname + '/UserAnswers.tsv', 'w') as new_file: write_tsv_row(new_file, ['UserId', 'Answers']) for k, v in user_to_answers.iteritems(): write_tsv_row(new_file, [k, ",".join(v)])
def write_tsv_row(f, l):
    """Write the items of `l` to file `f` as one tab-separated line.

    Every item is passed through `clean_elem` (not visible in this chunk)
    before joining; a trailing newline is appended.
    """
    stringified = [clean_elem(x) for x in l]
    f.write("\t".join(stringified) + "\n")


def parse_tags(tag_string):
    """Turn a tag string such as "<a><b>" into the comma-separated "a,b"."""
    return tag_string.replace("><", ",").replace("<", "").replace(">", "")


def get_answer_age(creation_date):
    """Return the whole seconds from `creation_date` to DATA_DUMP_DAY.

    `creation_date` is a timestamp shaped like "YYYY-MM-DDTHH:MM:SS" with an
    optional ".fff" fractional part.  The age is returned as a string.
    DATA_DUMP_DAY is defined outside this chunk; it has to be a `datetime`
    for the subtraction below to work.
    """
    the_date, the_time = creation_date.split("T")
    year, month, day = the_date.split("-")
    hour, minute, second = the_time.split(":")
    # partition() also accepts a timestamp that has no fractional part
    whole_second, _, fraction = second.partition(".")
    # datetime() expects microseconds: right-pad the digits so that "667"
    # means 0.667 s (667000 us) instead of 667 us
    microsecond = int((fraction + "000000")[:6])
    #https://stackoverflow.com/a/151211/5187393
    cdate = datetime(int(year), int(month), int(day),
                     int(hour), int(minute), int(whole_second), microsecond)
    delta = DATA_DUMP_DAY - cdate
    return str(delta.days * 24 * 3600 + delta.seconds)


# Extract one TSV row of features per answer that has an owner.
with open(dirname + '/Answers.xml') as afile, \
        open(dirname + '/AnswerFeatures.tsv', 'w') as new_file:
    write_tsv_row(new_file, ['Id', 'ParentId', 'AnswerAge', 'Score', 'Body',
                             'OwnerUserId', 'CommentCount'])
    for row, _line in RowIter(afile):
        # Answers without an owner are left out of the feature file
        if 'OwnerUserId' not in row:
            continue
        write_tsv_row(new_file, [
            row['Id'],
            row['ParentId'],
            get_answer_age(row['CreationDate']),
            row['Score'],
            row['Body'],  # text
            row['OwnerUserId'],
            row['CommentCount'],
        ])
try: s = str(elem) except UnicodeEncodeError: s = s.encode("utf-8") except: return "" return s.replace("\t", " ").replace("\n", " ") dirname = argv[1] def write_tsv_row(f, l): stringified = [clean_elem(x) for x in l] f.write("\t".join(stringified) + "\n") user_to_badges = {} with open(dirname + '/Badges.xml') as bfile: for row, line in RowIter(bfile): the_class = row['Class'] user_id = row['UserId'] if not user_id in user_to_badges: user_to_badges[user_id] = [0, 0, 0] user_to_badges[user_id][int(the_class) - 1] += 1 with open(dirname + '/UserBadges.tsv', 'w') as new_file: write_tsv_row(new_file, ['UserId', 'Gold', 'Silver', 'Bronze']) for k, v in user_to_badges.iteritems(): write_tsv_row(new_file, [k, v[0], v[1], v[2]])
# Only users which have interacted with a question with an accepted answer
# NOTE(review): the code keeps every user that owns a row in Questions.xml or
# Answers.xml; whether those files were already restricted to accepted-answer
# questions is not visible here -- confirm.
from rowiterator import RowIter
from sys import argv

dirname = argv[1]

# Ids of all users owning at least one question or answer
user_ids = set()
for posts_name in ('/Questions.xml', '/Answers.xml'):
    with open(dirname + posts_name) as posts_file:
        # Rows without an owner contribute nothing
        user_ids.update(
            row['OwnerUserId']
            for row, _ in RowIter(posts_file)
            if 'OwnerUserId' in row
        )

# Copy the raw line of every relevant user into RelUsers.xml
with open(dirname + '/Users.xml') as old_ufile, \
        open(dirname + '/RelUsers.xml', 'w') as new_ufile:
    for row, line in RowIter(old_ufile):
        if row['Id'] in user_ids:
            new_ufile.write(line + '\n')
def write_tsv_row(f, l): stringified = [clean_elem(x) for x in l] f.write("\t".join(stringified) + "\n") def get_network_age(creation_date): the_date, _ = creation_date.split("T") year, month, day = the_date.split("-") #https://stackoverflow.com/a/151211/5187393 cdate = date(int(year), int(month), int(day)) delta = DATA_DUMP_DAY - cdate return str(delta.days) with open(dirname +'/RelUsers.xml') as ufile: with open(dirname + '/UserFeatures.tsv', 'w') as new_file: write_tsv_row(new_file, ['Id', 'Reputation', 'LastAccessDate', 'Location', 'AboutMe', 'Views', 'NetworkAge', 'Age', 'UpVotes', 'DownVotes', 'Gold', 'Silver', 'Bronze', 'Questions', 'Answers']) for row, line in RowIter(ufile): the_id = row['Id'] rep = row['Reputation'] last_access_data = get_network_age(row['LastAccessDate']) try: about_me = row['AboutMe'] except: about_me = "," try: location = row['Location'] except: location = "," views = row['Views'] nage = get_network_age(row['CreationDate']) age = -1 if 'Age' in row:
from rowiterator import RowIter
from sys import argv

dirname = argv[1]

# Split Posts.xml by PostTypeId: type '1' rows go to Questions.xml, type '2'
# rows go to AllAnswers.xml, and any other row is echoed to stdout.
with open(dirname + '/Posts.xml') as infile, \
        open(dirname + '/Questions.xml', 'w') as qfile, \
        open(dirname + '/AllAnswers.xml', 'w') as afile:
    c = 0
    for row, line in RowIter(infile):
        # Progress report every 1000 rows.  A single pre-formatted string
        # prints the same text on Python 2 and Python 3 (a two-argument
        # print(...) call shows up as a tuple on Python 2).
        if c % 1000 == 0:
            print('Questions and answers %d' % c)
        post_type = row['PostTypeId']
        # Every question is written out; a stricter variant that also
        # required 'AcceptedAnswerId' in row is disabled.
        if post_type == '1':
            qfile.write(line + '\n')
        elif post_type == '2':
            afile.write(line + '\n')
        else:
            # Same output as the old Python 2 statement `print line,'\n'`
            # (line, a space, a newline, then print's own newline), but
            # valid syntax on Python 3 as well.
            print(line + ' \n')
        c += 1