Example #1
def so_special(treatment_feature, extra_filter):
    """Build treatment and control pageview lists for Stack Overflow
    answers, keeping only the first answer seen per parent question.

    Note: treatment_feature is currently unused in this snippet.
    """
    if extra_filter:
        qs1 = SampledStackOverflowPost.objects.filter(
            has_wiki_link=True, sample_num__in=[0, 1, 2],
            has_c_wiki_link=True).order_by('uid')
        qs2 = SampledStackOverflowPost.objects.filter(
            has_wiki_link=True,
            sample_num__in=[0, 1, 2],
            has_c_wiki_link=False).order_by('uid')
    else:
        qs1 = SampledStackOverflowPost.objects.filter(
            sample_num=0, has_wiki_link=True).order_by('uid')
        qs2 = SampledStackOverflowPost.objects.filter(
            sample_num=0, has_wiki_link=False).order_by('uid')

    # Sets give O(1) membership tests; lists would make the
    # per-question de-duplication below quadratic.
    treat_question_ids = set()
    control_question_ids = set()
    start_time = time.time()
    count = defaultdict(int)
    treat = []
    control = []

    for start, end, total, batch in batch_qs(qs1, batch_size=10000):
        print('qs1', start, end, total, time.time() - start_time)
        for obj in batch:
            ans = StackOverflowAnswer.objects.using('secondary').get(
                id=obj.uid)
            question_id = ans.parent_id
            if question_id not in treat_question_ids:
                treat.append(obj.num_pageviews)
                count['treatment_total'] += obj.num_pageviews
                count['treatment_count'] += 1
                treat_question_ids.add(question_id)
            else:
                count['dropped_treatment_total'] += obj.num_pageviews
                count['dropped_treatment_count'] += 1
    for start, end, total, batch in batch_qs(qs2, batch_size=10000):
        print('qs2', start, end, total, time.time() - start_time)
        for obj in batch:
            ans = StackOverflowAnswer.objects.using('secondary').get(
                id=obj.uid)
            question_id = ans.parent_id
            if question_id in treat_question_ids:
                count['dropped_control_total'] += obj.num_pageviews
                count['dropped_control_count'] += 1
                continue
            if question_id not in control_question_ids:
                control.append(obj.num_pageviews)
                count['control_total'] += obj.num_pageviews
                count['control_count'] += 1
                control_question_ids.add(question_id)
            else:
                count['dropped_control_total'] += obj.num_pageviews
                count['dropped_control_count'] += 1
    print(count)
    return treat, control
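
All of the examples on this page lean on the same batch_qs helper, which is
not shown. A minimal sketch, assuming it slices an ordered queryset into
fixed-size chunks; the signature is inferred from the call sites on this
page, not taken from the project:

def batch_qs(qs, total=None, batch_size=1000):
    """Yield (start, end, total, batch) slices over an ordered queryset."""
    if total is None:
        total = qs.count()
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        yield start, end, total, qs[start:end]
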
Example #2
def tags_frequency_distribution(qs):
    """
    Takes a qs and figure out which tags to links are found in

    This is more complicated than the generic frequency distribution
    because each post can have as many tags as desired by users
    """
    num_threads = qs.count()
    title = 'Identifying tag distribution for {} threads'.format(num_threads)
    print(title)
    tag_to_count = defaultdict(int)
    qs = qs.order_by('uid')

    # start, end, total
    for start, end, total, batch in batch_qs(qs, num_threads, 1000):
        print('Processing threads {} to {} of {}'.format(start, end, total))
        for thread in batch:
            tags = thread.tags_string.split('|')
            for tag in tags:
                tag_to_count[tag] += 1
    sorted_tag_to_count = sorted(tag_to_count.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)

    rows = []
    for i, (val, count) in enumerate(sorted_tag_to_count[:25]):
        percent = count / num_threads * 100
        print(i, (val, count), percent)
        rows.append([i, (val, count), percent])
    with open(title, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerows(rows)
Example #3
def bulk_save():
    """Runs through all the rows and re-saves each one to trigger
    recomputation of derived fields"""
    reddit = SampledRedditThread.objects.all().order_by('uid')
    stack = SampledStackOverflowPost.objects.all().order_by('uid')

    start_time = time.time()
    for start, end, total, batch in batch_qs(reddit, batch_size=10000):
        print('reddit', start, end, total, time.time() - start_time)
        for item in batch:
            item.save()
    start_time = time.time()  # reset the timer for the Stack Overflow pass
    for start, end, total, batch in batch_qs(stack, batch_size=10000):
        print('stack', start, end, total, time.time() - start_time)
        for item in batch:
            item.save()
Example #4
def get_method_outputs(qs):
    """Call the model method and return list of results"""
    # method_name is not defined here; it presumably comes from an
    # enclosing scope.
    vals = []
    qs = qs.order_by('uid')
    for _, _, _, batch in batch_qs(qs):
        for item in batch:
            vals.append(getattr(item, method_name)())
    return vals
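
A hedged usage sketch: since method_name is read from the enclosing scope,
the name below is set at module level purely for illustration and is not
the project's actual method:

method_name = 'get_absolute_url'  # hypothetical zero-argument model method
outputs = get_method_outputs(SampledRedditThread.objects.all())
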
Example #5
def save_posts(sample_num=None):
    """
    Re-save posts with wiki links to recompute derived rows
    """
    print('saving posts... (slow)')
    if sample_num is None:
        sample_num = [0, 1, 2]
    else:
        sample_num = sample_num.split(',')
        sample_num = [int(x) for x in sample_num]
    reddit = SampledRedditThread.objects.filter(
        has_wiki_link=True, sample_num__in=sample_num).order_by('uid')
    for start, end, total, batch in batch_qs(reddit):
        print('reddit', start, end, total)
        for item in batch:
            item.save()
    stack = SampledStackOverflowPost.objects.filter(
        has_wiki_link=True, sample_num__in=sample_num).order_by('uid')
    for start, end, total, batch in batch_qs(stack):
        print('stack', start, end, total)
        for item in batch:
            item.save()
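
Hypothetical calls showing the two forms sample_num accepts, per the
parsing above (None for the default samples, or a comma-separated string):

save_posts()        # re-saves samples 0, 1, and 2
save_posts('0,2')   # re-saves only samples 0 and 2
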
Example #6
def extract_vals_and_method_results(qs, field_names):
    """Extract either stored values or method results from a django QS"""
    rows = []
    for _, _, _, batch in batch_qs(qs, batch_size=1000):
        for obj in batch:
            row = []
            for field_name in field_names:
                try:
                    # Attribute is a zero-argument method: call it.
                    val = getattr(obj, field_name)()
                except TypeError:
                    # Attribute is a stored value: calling it raised
                    # TypeError, so use it directly.
                    val = getattr(obj, field_name)
                row.append(val)
            rows.append(row)
    return rows
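
A hypothetical call mixing stored fields with zero-argument methods; the
try/except above is what lets both kinds share one field_names list
(get_absolute_url is illustrative, the other two names appear in
Example #1):

rows = extract_vals_and_method_results(
    SampledStackOverflowPost.objects.order_by('uid'),
    ['uid', 'num_pageviews', 'get_absolute_url'])
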
Example #7
def frequency_distribution(qs, field, qs_name, extractor=None):
    """
    Takes a qs a figure out which base urls the links go to
    """
    num_threads = qs.count()
    title = 'Frequency Distribution of {} in subset "{}" ({} threads)'.format(
        field, qs_name, num_threads)
    filename = "{}_{}_{}.csv".format(field, qs_name, num_threads)
    print(title)
    val_to_count = defaultdict(int)
    qs = qs.order_by('uid')

    # start, end, total
    start_time = time.time()
    for start, end, total, batch in batch_qs(qs, num_threads, 10000):
        stamp = time.time()
        for thread in batch:
            vals = [getattr(thread, field)]
            if extractor is not None:
                vals = extractor(vals[0])
            for val in vals:
                val_to_count[val] += 1
        print('Finished threads {} to {} of {}. Took {}'.format(
            start, end, total,
            time.time() - stamp))
        print('Running time: {}'.format(time.time() - start_time))
        print(len(val_to_count))
    sorted_val_to_count = sorted(val_to_count.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
    plot_bar(sorted_val_to_count[:20], title, filename)

    rows = []
    for i, (val, count) in enumerate(sorted_val_to_count):
        percent = count / num_threads * 100
        print(i, (val, count), percent)
        rows.append([i, (val, count), percent])
    with open('csv_files/' + filename, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerows(rows)
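
A hypothetical call using the extractor hook: a splitter turns one
pipe-delimited field into several counted values, the same shape of
problem tags_frequency_distribution handles in Example #2 (the qs_name
label is made up):

frequency_distribution(
    SampledStackOverflowPost.objects.filter(has_wiki_link=True),
    'tags_string', 'wiki_linked',
    extractor=lambda s: s.split('|'))
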
Example #8
def mark_top_answers():
    """Marks answers whose score equals the top score on their question"""
    qs = SampledStackOverflowPost.objects.all().order_by('uid')
    for start, end, total, batch in batch_qs(qs, batch_size=10000):
        print(start, end, total)
        for answer in batch:
            try:
                question_id = StackOverflowAnswer.objects.using(
                    'secondary').filter(
                        id=answer.uid).values('parent_id')[0]['parent_id']
                other_answers = StackOverflowAnswer.objects.using(
                    'secondary').filter(parent_id=question_id)
                max_score = other_answers.aggregate(Max('score'))['score__max']
                print(max_score)
                if answer.score == max_score:
                    print('marking top answer as true woohoo!')
                    answer.is_top = True
                    answer.save()
            except Exception as err:
                # Most often the values(...)[0] lookup failing because the
                # parent question is missing from the secondary database.
                print(err)
                print('MISSING QUESTION UH OH')
Example #9
def main(do_all=False):
    """driver"""
    reddit = praw.Reddit(client_id=os.environ["CLIENT_ID"],
                         client_secret=os.environ["CLIENT_SECRET"],
                         user_agent=os.environ["UA"])
    processor = give_author_processor(reddit)
    if do_all:
        print('Reprocessing all reddit authors')
        qs = SampledRedditThread.objects.all()
    else:
        print('will only process new samples')
        qs = SampledRedditThread.objects.filter(user_info_processed=False)

    qs = qs.order_by('uid')
    while qs.exists():
        start_time = time.time()
        for start, end, total, batch in batch_qs(qs):
            print(start, end, total, time.time() - start_time)
            for thread in batch:
                author_dict = processor(thread.author)
                for key, val in author_dict.items():
                    setattr(thread, key, val)
                thread.user_info_processed = True
                thread.save()
        if do_all:
            break  # .all() never empties, so stop after one full pass
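
give_author_processor is not shown on this page. A minimal sketch under
assumed behavior: it returns a closure that looks up a Redditor through
praw and maps profile data onto model-field names. The field names here
are guesses for illustration, not the project's schema:

def give_author_processor(reddit):
    """Return a function mapping an author name to a dict of field values."""
    def process(author_name):
        try:
            redditor = reddit.redditor(author_name)
            return {
                # hypothetical field names
                'author_comment_karma': redditor.comment_karma,
                'author_link_karma': redditor.link_karma,
            }
        except Exception:
            return {}  # deleted or suspended account
    return process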