Ejemplo n.º 1
0
        s = str(elem)
    except UnicodeEncodeError:
        s = s.encode("utf-8")
    except:
        return ""
    return s.replace("\t", " ").replace("\n", " ")


# Directory that holds the input and output files used below; taken from the
# first command-line argument (argv is presumably sys.argv -- its import is
# not visible in this excerpt).
dirname = argv[1]


def write_tsv_row(f, l):
    """Write the items of ``l`` to ``f`` as a single tab-separated line.

    Each item is first passed through ``clean_elem`` (defined outside this
    function); the results are joined with tabs and terminated by a newline.
    """
    cells = map(clean_elem, l)
    f.write("\t".join(cells) + "\n")


# Maps each owner user id to the list of ids of the rows that user owns in
# Answers.xml.
user_to_answers = {}

with open(dirname + '/Answers.xml') as qfile:
    for row, line in RowIter(qfile):
        # Rows without an owner cannot be attributed to any user; skip them.
        if 'OwnerUserId' not in row:
            continue
        user_to_answers.setdefault(row['OwnerUserId'], []).append(row['Id'])

# One output line per user: the user id and a comma-separated list of the
# collected answer ids.
with open(dirname + '/UserAnswers.tsv', 'w') as new_file:
    write_tsv_row(new_file, ['UserId', 'Answers'])
    # items() is available on both Python 2 and Python 3, unlike iteritems().
    for user_id, answer_ids in user_to_answers.items():
        write_tsv_row(new_file, [user_id, ",".join(answer_ids)])
Ejemplo n.º 2
0
def write_tsv_row(f, l):
    """Emit one TSV line on ``f`` built from the items of ``l``.

    Items are run through ``clean_elem`` (defined outside this function),
    joined with tab characters, and written with a trailing newline.
    """
    row_text = "\t".join(clean_elem(item) for item in l)
    f.write(row_text + "\n")

def parse_tags(tag_string):
    """Turn an angle-bracketed tag string into a comma-separated one.

    Adjacent ``><`` pairs become commas, then any remaining ``<`` and ``>``
    characters are removed, e.g. ``"<a><b>"`` becomes ``"a,b"``.
    """
    result = tag_string
    # Order matters: the "><" separators must be rewritten before the lone
    # brackets are stripped.
    for old, new in (("><", ","), ("<", ""), (">", "")):
        result = result.replace(old, new)
    return result

def get_answer_age(creation_date):
    """Return the age of an answer in whole seconds, as a string.

    ``creation_date`` is a timestamp of the form ``YYYY-MM-DDTHH:MM:SS`` whose
    seconds may carry an optional ``.fraction`` suffix.  The age is the
    difference between the module-level ``DATA_DUMP_DAY`` (presumably a
    ``datetime`` -- defined outside this function) and that timestamp.

    Raises ``ValueError`` when the timestamp does not have the expected shape.
    """
    date_part, time_part = creation_date.split("T")
    year, month, day = date_part.split("-")
    hour, minute, second = time_part.split(":")
    # The fractional part is optional; partition() tolerates its absence
    # where a two-way unpack of split(".") would raise.
    whole_seconds, _, fraction = second.partition(".")
    # Scale the fraction to microseconds: ".667" means 667000 us, not 667 us.
    microseconds = int(fraction[:6].ljust(6, "0")) if fraction else 0
    cdate = datetime(int(year), int(month), int(day), int(hour), int(minute),
                     int(whole_seconds), microseconds)
    # A timedelta keeps days and seconds separately, so combine both parts.
    # https://stackoverflow.com/a/151211/5187393
    delta = DATA_DUMP_DAY - cdate
    return str(delta.days * 24 * 3600 + delta.seconds)

# Convert the owned rows of Answers.xml into AnswerFeatures.tsv, one TSV row
# per answer, preceded by a header row.
with open(dirname +'/Answers.xml') as afile, \
        open(dirname + '/AnswerFeatures.tsv', 'w') as new_file:
    write_tsv_row(new_file, ['Id', 'ParentId', 'AnswerAge', 'Score', 'Body', 'OwnerUserId', 'CommentCount'])
    for row, line in RowIter(afile):
        # Rows without an owner are left out of the output.
        if 'OwnerUserId' not in row:
            continue
        write_tsv_row(new_file, [
            row['Id'],
            row['ParentId'],
            get_answer_age(row['CreationDate']),
            row['Score'],
            row['Body'],  # text
            row['OwnerUserId'],
            row['CommentCount'],
        ])
Ejemplo n.º 3
0
    try:
        s = str(elem)
    except UnicodeEncodeError:
        s = s.encode("utf-8")
    except:
        return ""
    return s.replace("\t", " ").replace("\n", " ")


# Directory that holds the input and output files used below; taken from the
# first command-line argument (argv is presumably sys.argv -- its import is
# not visible in this excerpt).
dirname = argv[1]


def write_tsv_row(f, l):
    """Write ``l`` to ``f`` as one line of tab-separated values.

    Every item goes through ``clean_elem`` (defined outside this function)
    before the cells are joined with tabs; a newline ends the row.
    """
    cells = []
    for item in l:
        cells.append(clean_elem(item))
    f.write("\t".join(cells) + "\n")


# Per-user badge counters.  Each value is a three-slot list indexed by the
# badge class minus one; the header written below labels the slots Gold,
# Silver and Bronze in that order.
user_to_badges = {}
with open(dirname + '/Badges.xml') as bfile:
    for row, line in RowIter(bfile):
        the_class = row['Class']
        counts = user_to_badges.setdefault(row['UserId'], [0, 0, 0])
        # NOTE(review): assumes Class is always 1, 2 or 3 -- a larger value
        # raises IndexError and 0 would silently land in the last slot.
        counts[int(the_class) - 1] += 1

with open(dirname + '/UserBadges.tsv', 'w') as new_file:
    write_tsv_row(new_file, ['UserId', 'Gold', 'Silver', 'Bronze'])
    # items() is available on both Python 2 and Python 3, unlike iteritems().
    for user_id, counts in user_to_badges.items():
        write_tsv_row(new_file, [user_id, counts[0], counts[1], counts[2]])
Ejemplo n.º 4
0
#Only users which have interacted with a question with an accepted answer
from rowiterator import RowIter
from sys import argv

# Directory holding the XML files, given as the first command-line argument.
dirname = argv[1]
# Ids of the users that own at least one of the scanned rows.
user_ids = set()
with open(dirname +'/Questions.xml') as qfile:
    for row, line in RowIter(qfile):
        # Rows without an owner contribute nothing.  The former no-op else
        # branch was indented with a tab mixed into spaces, which Python 3
        # rejects, so it is dropped.
        if 'OwnerUserId' in row:
            user_ids.add(row['OwnerUserId'])

# Add the owner of every owned row in Answers.xml to user_ids; rows without
# an owner are ignored.
with open(dirname +'/Answers.xml') as afile:
    user_ids.update(
        row['OwnerUserId']
        for row, line in RowIter(afile)
        if 'OwnerUserId' in row
    )

# Copy to RelUsers.xml only those lines of Users.xml whose Id was collected
# in user_ids.
with open(dirname + '/Users.xml') as old_ufile, \
        open(dirname + '/RelUsers.xml', 'w') as new_ufile:
    for row, line in RowIter(old_ufile):
        if row['Id'] not in user_ids:
            continue
        new_ufile.write(line + '\n')

Ejemplo n.º 5
0
def write_tsv_row(f, l):
    """Write a single tab-separated line to ``f`` from the items of ``l``.

    ``clean_elem`` (defined outside this function) is applied to each item;
    the cleaned values are tab-joined and followed by a newline.
    """
    cleaned = map(clean_elem, l)
    row_text = "\t".join(cleaned)
    f.write(row_text + "\n")

def get_network_age(creation_date):
    """Return, as a string, the days between ``creation_date`` and the dump day.

    Only the ``YYYY-MM-DD`` part before the ``T`` of ``creation_date`` is
    used.  The result is the ``days`` component of the module-level
    ``DATA_DUMP_DAY`` (defined outside this function) minus that date.
    """
    day_part, _time_part = creation_date.split("T")
    year, month, day = day_part.split("-")
    # https://stackoverflow.com/a/151211/5187393
    created_on = date(*[int(piece) for piece in (year, month, day)])
    return str((DATA_DUMP_DAY - created_on).days)

with open(dirname +'/RelUsers.xml') as ufile:
    with open(dirname + '/UserFeatures.tsv', 'w') as new_file:
        write_tsv_row(new_file, ['Id', 'Reputation', 'LastAccessDate', 'Location', 'AboutMe', 'Views', 'NetworkAge', 'Age', 'UpVotes', 'DownVotes', 'Gold', 'Silver', 'Bronze', 'Questions', 'Answers'])
        for row, line in RowIter(ufile):
            the_id = row['Id']
            rep = row['Reputation']
            last_access_data = get_network_age(row['LastAccessDate'])
            try:
                about_me = row['AboutMe']
            except:
                about_me = ","
            try:
                location = row['Location']
            except:
                location = ","
            views = row['Views']
            nage = get_network_age(row['CreationDate'])
            age = -1
            if 'Age' in row:
Ejemplo n.º 6
0
from rowiterator import RowIter
from sys import argv

# Directory holding the XML files, given as the first command-line argument.
dirname = argv[1]
# Split Posts.xml by PostTypeId: '1' rows go to Questions.xml, '2' rows go to
# AllAnswers.xml, and anything else is echoed to stdout.
with open(dirname +'/Posts.xml') as infile, \
        open(dirname + '/Questions.xml', 'w') as qfile, \
        open(dirname + '/AllAnswers.xml', 'w') as afile:
    c = 0
    for row, line in RowIter(infile):
        # Progress report every 1000 rows.  A single pre-formatted argument
        # prints the same text on Python 2 and Python 3 (the two-argument
        # call form printed a tuple repr on Python 2).
        if c % 1000 == 0:
            print('Questions and answers %d' % c)
        post_type = row['PostTypeId']
        #if post_type == '1' and 'AcceptedAnswerId' in row:
        if post_type == '1':
            qfile.write(line + '\n')
        elif post_type == '2':
            afile.write(line + '\n')
        else:
            # Call-style print replaces the Python 2-only print statement,
            # which is a SyntaxError on Python 3.
            print(line + '\n')
        c += 1