Example #1
0
def download(doc_id):
    """
    Build and return a CSV comparing word usage in the two samediff documents.

    Looks up the 'samediff' document for ``doc_id`` and writes one row per
    word: first the words both documents share (per-document counts plus the
    total), then the words that only appear in document 1, then the words
    that only appear in document 2 (the other document's count is 0).

    Any failure while building the file (lookup error, missing key, a shared
    word that is absent from a frequency list) is logged with its traceback
    and answered with ``abort(400)``.
    """
    try:
        logger.debug("Download %s", doc_id)
        doc = mongo.find_document('samediff', doc_id)
        headers = [
            _('word'),
            _('uses in') + ' ' + doc['filenames'][0],
            _('uses in') + ' ' + doc['filenames'][1],
            _('total uses')
        ]
        rows = []
        # entries are (count, word) pairs
        for f, w in doc['sameWords']:
            doc1Count = next(f2 for f2, w2 in doc['mostFrequentDoc1']
                             if w == w2)
            doc2Count = next(f2 for f2, w2 in doc['mostFrequentDoc2']
                             if w == w2)
            rows.append([w, doc1Count, doc2Count, f])
        for f, w in doc['diffWordsDoc1']:
            rows.append([w, f, 0, f])
        # words unique to the second document go in the second count column
        for f, w in doc['diffWordsDoc2']:
            rows.append([w, 0, f, f])
        # TODO: clean up file name
        file_path = filehandler.write_to_csv(
            headers, rows,
            filehandler.generate_filename('csv', '', doc['filenames'][0],
                                          doc['filenames'][1]), False)
        logger.debug('  created csv to download at %s', file_path)
        return filehandler.generate_csv(file_path)
    except Exception as e:
        # use the same logger as the rest of this function
        logger.exception(e)
        abort(400)
Example #2
0
def download_gexf(doc_id):
    """
    Serve the stored GEXF graph of a connectthedots document.

    Reads the 'gexf' entry from the document's 'results' and returns it in a
    Response with the 'application/xml' mimetype.
    """
    logger.info('[CTD] Requesting GEXF for doc: %s', doc_id)
    document = mongo.find_document('connectthedots', doc_id)
    analysis = document.get('results')
    gexf_payload = analysis['gexf']
    return Response(gexf_payload, mimetype='application/xml')
Example #3
0
def results(doc_id):
    """
    Render the samediff results page for ``doc_id``.

    Falls back to the 'no_results.html' page when the document cannot be
    loaded. For documents whose 'sample_id' is empty, the remaining
    retention days are looked up and passed to the template.

    The template receives the stored job, a relative length difference
    between the two documents (0.0 - 1.0, relative to the longer one), the
    cosine similarity with its textual interpretation, and a few
    "what next" hints.
    """
    remaining_days = None

    try:
        job = mongo.find_document('samediff', doc_id)
        if job['sample_id'] == u'':
            remaining_days = mongo.get_remaining_days('samediff', doc_id)
    except Exception:
        logger.warning("Unable to find doc '%s'", doc_id)
        return render_template('no_results.html', tool_name='samediff')

    same_words = job['sameWords']
    doc2_only_words = job['diffWordsDoc2']
    # each entry is a (count, word) pair; hints default to '' when missing
    whatnext = {
        'most_common_word': same_words[0][1] if len(same_words) > 0 else '',
        'second_most_common_word': same_words[1][1] if len(same_words) > 1 else '',
        'doc2_most_common_word': doc2_only_words[0][1] if len(doc2_only_words) > 0 else '',
    }

    # length difference relative to the longer document; two empty documents
    # have no difference (and must not divide by zero)
    longer = max(job['totalWordsDoc1'], job['totalWordsDoc2'])
    if longer:
        pct_length_diff = float(abs(job['totalWordsDoc1'] - job['totalWordsDoc2'])) / float(longer)
    else:
        pct_length_diff = 0.0

    return render_template('samediff/results.html', results=job,
        pct_length_diff=pct_length_diff,
        cosine_similarity={'score': job['cosineSimilarity'], 'description': interpretCosineSimilarity(job['cosineSimilarity'])},
        whatnext=whatnext, tool_name='samediff', doc_id=doc_id,
        remaining_days=remaining_days)
Example #4
0
def results(doc_id):
    """
    Render the samediff results page for ``doc_id``.

    Falls back to the 'no_results.html' page when the document cannot be
    loaded. For documents whose 'sample_id' is empty, the remaining
    retention days are looked up and passed to the template.

    The template receives the stored job, a relative length difference
    between the two documents (0.0 - 1.0, relative to the longer one), the
    cosine similarity with its textual interpretation, and a few
    "what next" hints.
    """
    remaining_days = None

    try:
        job = mongo.find_document('samediff', doc_id)
        if job['sample_id'] == u'':
            remaining_days = mongo.get_remaining_days('samediff', doc_id)
    except Exception:
        logger.warning("Unable to find doc '%s'", doc_id)
        return render_template('no_results.html', tool_name='samediff')

    same_words = job['sameWords']
    doc2_only_words = job['diffWordsDoc2']
    # each entry is a (count, word) pair; hints default to '' when missing
    whatnext = {
        'most_common_word': same_words[0][1] if len(same_words) > 0 else '',
        'second_most_common_word': same_words[1][1] if len(same_words) > 1 else '',
        'doc2_most_common_word': doc2_only_words[0][1] if len(doc2_only_words) > 0 else '',
    }

    # length difference relative to the longer document; two empty documents
    # have no difference (and must not divide by zero)
    longer = max(job['totalWordsDoc1'], job['totalWordsDoc2'])
    if longer:
        pct_length_diff = float(abs(job['totalWordsDoc1'] - job['totalWordsDoc2'])) / float(longer)
    else:
        pct_length_diff = 0.0

    return render_template('samediff/results.html', results=job,
        pct_length_diff=pct_length_diff,
        cosine_similarity={'score': job['cosineSimilarity'], 'description': interpretCosineSimilarity(job['cosineSimilarity'])},
        whatnext=whatnext, tool_name='samediff', doc_id=doc_id,
        remaining_days=remaining_days)
Example #5
0
def download_gexf(doc_id):
    """
    Serve the stored GEXF graph of a connectthedots document.

    Reads the 'gexf' entry from the document's 'results' and returns it in a
    Response with the 'application/xml' mimetype.
    """
    logger.info('[CTD] Requesting GEXF for doc: %s', doc_id)
    document = mongo.find_document('connectthedots', doc_id)
    analysis = document.get('results')
    gexf_payload = analysis['gexf']
    return Response(gexf_payload, mimetype='application/xml')
Example #6
0
def results(doc_id):
    """
    Show the connectthedots results page for a document.

    Any exception raised while looking the document up or rendering it is
    logged as a warning and answered with the 'no_results.html' page.
    """
    try:
        # Lookup is done only so a missing document fails here; the value is
        # not used because render_results() loads the document itself.
        mongo.find_document('connectthedots', doc_id).get('results')
        logger.info('[CTD] Showing results for doc: %s', doc_id)
        page = render_results(doc_id)
    except Exception as error:
        logger.warning('[CTD] Unable to find doc: %s', doc_id)
        logger.warning('[CTD] Error: %s', str(error))
        page = render_template('no_results.html', tool_name='connectthedots')
    return page
Example #7
0
def download_table(doc_id):
    """
    Download CSV of degree/centrality scores

    Streams one header line followed by one line per entry of the document's
    results 'table' (id, degree, centrality, community) with the 'text/csv'
    mimetype.
    """
    logger.info('[CTD] Requesting CSV of table for doc: %s', doc_id)
    doc = mongo.find_document('connectthedots', doc_id)

    def csv_field(value):
        # Quote per RFC 4180 so a comma, quote or line break inside a value
        # (e.g. a node id) cannot shift or split the columns. Values without
        # those characters are emitted unchanged.
        text = str(value)
        if any(ch in text for ch in ',"\r\n'):
            return '"' + text.replace('"', '""') + '"'
        return text

    def as_csv(rows, headers):
        yield ','.join(csv_field(h) for h in headers) + '\n'
        for r in rows:
            fields = [r['id'], r['degree'], r['centrality'], r['community']]
            yield ','.join(csv_field(f) for f in fields) + '\n'

    return Response(
        as_csv(doc.get('results')['table'], ['node', 'degree', 'betweenness centrality', 'community']),
        mimetype='text/csv')
Example #8
0
def render_results(doc_id):
    """
    Render the connectthedots results page.

    Besides the stored results, the template gets "what next" hints built by
    comparing the table in its stored order with the same table sorted by
    descending degree: the first id on which the two orderings disagree, its
    ordinal rank in each ordering, and the id with the lowest degree.
    """
    doc = mongo.find_document('connectthedots', doc_id)
    results = doc.get('results')

    # only non-sample documents get a remaining-days lookup
    remaining_days = None
    if doc.get('source') != 'sample':
        remaining_days = mongo.get_remaining_days('connectthedots', doc_id)

    by_degree = sorted(results['table'],
                       key=operator.itemgetter('degree'),
                       reverse=True)
    # stored order; presumably already sorted by centrality - confirm upstream
    by_centrality = results['table']

    # first position where the two orderings name a different node
    mismatch_id = None
    degree_pos = 0
    centrality_pos = 0
    for pos, (degree_row, centrality_row) in enumerate(
            zip(by_degree, by_centrality)):
        if degree_row['id'] != centrality_row['id']:
            mismatch_id = degree_row['id']
            degree_pos = pos
            break

    if mismatch_id is not None:
        # the node can only show up later in the stored ordering
        resume_at = degree_pos + 1
        for pos, centrality_row in enumerate(by_centrality[resume_at:],
                                             resume_at):
            if centrality_row['id'] == mismatch_id:
                centrality_pos = pos
                break

    hints = {}
    hints['mismatch_id'] = mismatch_id
    hints['mismatch_degree'] = ordinal(degree_pos + 1)
    hints['mismatch_centrality'] = ordinal(centrality_pos + 1)
    hints['lowest_degree'] = by_degree[-1]['id']

    if 'biography' in results:
        biography = results['biography']
    else:
        biography = None

    return render_template('connectthedots/results.html',
                           results=results,
                           whatnext=hints,
                           tool_name='connectthedots',
                           source=doc['source'],
                           has_multiple_sheets=results['has_multiple_sheets'],
                           remaining_days=remaining_days,
                           biography=biography)
Example #9
0
def download_csv(doc_id, analysis_type):
    """
    Return a CSV download of one wordcounter analysis for ``doc_id``.

    ``analysis_type`` must be 'words', 'bigrams' or 'trigrams'; anything else
    is answered with ``abort(400)``. A document that cannot be loaded is also
    answered with ``abort(400)``; a CSV file that could not be created is
    answered with ``abort(500)``.
    """
    logger.debug("Download %s", analysis_type)
    if analysis_type not in ['words','bigrams','trigrams']:
        logger.warning("Requested unknown csv type: %s",analysis_type)
        abort(400)
    try:
        doc = mongo.find_document('wordcounter', doc_id)
    except Exception:
        logger.warning("Unable to find doc '%s'", doc_id)
        abort(400)
    # NOTE(review): guards against a lookup that reports a missing document
    # by returning None instead of raising - confirm against find_document
    if doc is None:
        logger.warning("Unable to find doc '%s'", doc_id)
        abort(400)
    file_path = create_csv_file(doc.get('counts'),analysis_type)
    if file_path is None:
        logger.warning("Unable to create %s csv for doc '%s'", analysis_type, doc_id)
        abort(500)
    # only report the file once we know it exists
    logger.debug('  created %s csv to download at %s', analysis_type, file_path)
    return filehandler.generate_csv(file_path)
Example #10
0
def download_csv(doc_id, analysis_type):
    """
    Return a CSV download of one wordcounter analysis for ``doc_id``.

    ``analysis_type`` must be 'words', 'bigrams' or 'trigrams'; anything else
    is answered with ``abort(400)``. A document that cannot be loaded is also
    answered with ``abort(400)``; a CSV file that could not be created is
    answered with ``abort(500)``.
    """
    logger.debug("Download %s", analysis_type)
    if analysis_type not in ['words','bigrams','trigrams']:
        logger.warning("Requested unknown csv type: %s",analysis_type)
        abort(400)
    try:
        doc = mongo.find_document('wordcounter', doc_id)
    except Exception:
        logger.warning("Unable to find doc '%s'", doc_id)
        abort(400)
    # NOTE(review): guards against a lookup that reports a missing document
    # by returning None instead of raising - confirm against find_document
    if doc is None:
        logger.warning("Unable to find doc '%s'", doc_id)
        abort(400)
    file_path = create_csv_file(doc.get('counts'),analysis_type)
    if file_path is None:
        logger.warning("Unable to create %s csv for doc '%s'", analysis_type, doc_id)
        abort(500)
    # only report the file once we know it exists
    logger.debug('  created %s csv to download at %s', analysis_type, file_path)
    return filehandler.generate_csv(file_path)
Example #11
0
def results_page(doc_id):
    """
    Entry point for the wtfcsv results of ``doc_id``.

    When the stored results hold more than one sheet, redirect to the page of
    sheet 0 (forwarding '?submit=true' when the request's 'submit' argument
    contains 'true'); otherwise render the only sheet directly. Any exception
    is logged and answered with the 'no_results.html' page.
    """
    try:
        document = mongo.find_document('wtfcsv', doc_id)
        sheets = document.get('results')

        # Single sheet: nothing to choose from, render it right away
        if len(sheets) <= 1:
            logger.info("Showing results %s", doc_id)
            return render_results(doc_id, 0)

        # Doc has more than one sheet to analyze
        logger.info("Showing results %s (sheet 0)", doc_id)
        submit_flag = request.args.get('submit', '')
        if 'true' in submit_flag:
            query = '?submit=true'
        else:
            query = ''
        target = (g.current_lang + '/wtfcsv/results/' + doc_id +
                  '/sheets/0' + query)
        return redirect(target)
    except Exception as error:
        logger.warning("Unable to find doc '%s'", doc_id)
        logger.exception(error)
        return render_template('no_results.html', tool_name='wtfcsv')
Example #12
0
def render_results(doc_id):
    """
    Render the connectthedots results page.

    Besides the stored results, the template gets "what next" hints built by
    comparing the table in its stored order with the same table sorted by
    descending degree: the first id on which the two orderings disagree, its
    ordinal rank in each ordering, and the id with the lowest degree.
    """
    doc = mongo.find_document('connectthedots', doc_id)
    results = doc.get('results')

    # only non-sample documents get a remaining-days lookup
    remaining_days = None
    if doc.get('source') != 'sample':
        remaining_days = mongo.get_remaining_days('connectthedots', doc_id)

    by_degree = sorted(results['table'], key=operator.itemgetter('degree'), reverse=True)
    # stored order; presumably already sorted by centrality - confirm upstream
    by_centrality = results['table']

    # first position where the two orderings name a different node
    mismatch_id = None
    degree_pos = 0
    centrality_pos = 0
    for pos, (degree_row, centrality_row) in enumerate(zip(by_degree, by_centrality)):
        if degree_row['id'] != centrality_row['id']:
            mismatch_id = degree_row['id']
            degree_pos = pos
            break

    if mismatch_id is not None:
        # the node can only show up later in the stored ordering
        resume_at = degree_pos + 1
        for pos, centrality_row in enumerate(by_centrality[resume_at:], resume_at):
            if centrality_row['id'] == mismatch_id:
                centrality_pos = pos
                break

    hints = {
        'mismatch_id': mismatch_id,
        'mismatch_degree': ordinal(degree_pos + 1),
        'mismatch_centrality': ordinal(centrality_pos + 1),
        'lowest_degree': by_degree[-1]['id'],
    }

    return render_template('connectthedots/results.html',
                           results=results,
                           whatnext=hints,
                           tool_name='connectthedots',
                           source=doc['source'],
                           has_multiple_sheets=results['has_multiple_sheets'],
                           remaining_days=remaining_days)
Example #13
0
def download_table(doc_id):
    """
    Download CSV of degree/centrality scores

    Streams one header line followed by one line per entry of the document's
    results 'table' (id, degree, centrality, community) with the 'text/csv'
    mimetype.
    """
    logger.info('[CTD] Requesting CSV of table for doc: %s', doc_id)
    doc = mongo.find_document('connectthedots', doc_id)

    def csv_field(value):
        # Quote per RFC 4180 so a comma, quote or line break inside a value
        # (e.g. a node id) cannot shift or split the columns. Values without
        # those characters are emitted unchanged.
        text = str(value)
        if any(ch in text for ch in ',"\r\n'):
            return '"' + text.replace('"', '""') + '"'
        return text

    def as_csv(rows, headers):
        yield ','.join(csv_field(h) for h in headers) + '\n'
        for r in rows:
            fields = [r['id'], r['degree'], r['centrality'], r['community']]
            yield ','.join(csv_field(f) for f in fields) + '\n'

    return Response(as_csv(
        doc.get('results')['table'],
        ['node', 'degree', 'betweenness centrality', 'community']),
                    mimetype='text/csv')
Example #14
0
def results(doc_id):
    """
    Entry point for the wtfcsv results of ``doc_id``.

    When the stored results hold more than one sheet, redirect to the page of
    sheet 0 (forwarding '?submit=true' when the request's 'submit' argument
    contains 'true'); otherwise render the only sheet directly. Any exception
    is logged (type and traceback) and answered with the 'no_results.html'
    page.
    """
    try:
        sheets = mongo.find_document('wtfcsv', doc_id).get('results')

        #Doc has more than one sheet to analyze
        if len(sheets) > 1:
            logger.info("Showing results %s (sheet 0)", doc_id)
            submit = request.args.get('submit', '')
            param = '?submit=true' if 'true' in submit else ''
            return redirect(g.current_lang + '/wtfcsv/results/' + doc_id + '/sheets/0' + param)
        else:
            logger.info("Showing results %s", doc_id)
            return render_results(doc_id, 0)
    except Exception:
        #more robust exception logging
        logger.warning("Unable to find doc '%s'", doc_id)
        # the argument needs a %s placeholder, otherwise logging fails to
        # format the record and the message is lost
        logger.warning("Unexpected error: %s", sys.exc_info()[0])
        logger.warning(traceback.format_exc())

        return render_template('no_results.html', tool_name='wtfcsv')
Example #15
0
def results(doc_id):
    """
    Entry point for the wtfcsv results of ``doc_id``.

    When the stored results hold more than one sheet, redirect to the page of
    sheet 0 (forwarding '?submit=true' when the request's 'submit' argument
    contains 'true'); otherwise render the only sheet directly. Any exception
    is logged (type and traceback) and answered with the 'no_results.html'
    page.
    """
    try:
        sheets = mongo.find_document('wtfcsv', doc_id).get('results')

        #Doc has more than one sheet to analyze
        if len(sheets) > 1:
            logger.info("Showing results %s (sheet 0)", doc_id)
            submit = request.args.get('submit', '')
            param = '?submit=true' if 'true' in submit else ''
            return redirect(g.current_lang + '/wtfcsv/results/' + doc_id +
                            '/sheets/0' + param)
        else:
            logger.info("Showing results %s", doc_id)
            return render_results(doc_id, 0)
    except Exception:
        #more robust exception logging
        logger.warning("Unable to find doc '%s'", doc_id)
        # the argument needs a %s placeholder, otherwise logging fails to
        # format the record and the message is lost
        logger.warning("Unexpected error: %s", sys.exc_info()[0])
        logger.warning(traceback.format_exc())

        return render_template('no_results.html', tool_name='wtfcsv')
Example #16
0
def download(doc_id):
    """
    Build and return a CSV comparing word usage in the two samediff documents.

    Looks up the 'samediff' document for ``doc_id`` and writes one row per
    word: first the words both documents share (per-document counts plus the
    total), then the words that only appear in document 1, then the words
    that only appear in document 2 (the other document's count is 0).

    Any failure while building the file (lookup error, missing key, a shared
    word that is absent from a frequency list) is logged with its traceback
    and answered with ``abort(400)``.
    """
    try:
        logger.debug("Download %s", doc_id)
        doc = mongo.find_document('samediff', doc_id)
        headers = [_('word'), _('uses in') +' ' + doc['filenames'][0], _('uses in') + ' ' + doc['filenames'][1], _('total uses')]
        rows = []
        # entries are (count, word) pairs
        for f, w in doc['sameWords']:
            doc1Count = next(f2 for f2, w2 in doc['mostFrequentDoc1'] if w == w2)
            doc2Count = next(f2 for f2, w2 in doc['mostFrequentDoc2'] if w == w2)
            rows.append([w, doc1Count, doc2Count, f])
        for f, w in doc['diffWordsDoc1']:
            rows.append([w, f, 0, f])
        # words unique to the second document go in the second count column
        for f, w in doc['diffWordsDoc2']:
            rows.append([w, 0, f, f])
        # TODO: clean up file name
        file_path = filehandler.write_to_csv(headers, rows,
            filehandler.generate_filename('csv', '', doc['filenames'][0], doc['filenames'][1]), False)
        logger.debug('  created csv to download at %s', file_path)
        return filehandler.generate_csv(file_path)
    except Exception as e:
        # use the same logger as the rest of this function
        logger.exception(e)
        abort(400)
Example #17
0
def render_results(doc_id, sheet_idx):
    """
    Render the wtfcsv results page for one sheet of a document.

    ``sheet_idx`` selects the sheet inside the stored results (it is passed
    through ``int()``). Results flagged with 'bad_formatting' are rendered
    immediately without any further processing.

    For a sheet with columns, up to three distinct random columns are picked
    for the "what next" hints (the first one preferring a column that has
    'most_freq_values'), and every column gets an 'overview' entry holding
    the categories/values used by the chart. A sheet without columns gets
    the string 'no_data' as its ``whatnext`` value.
    """
    doc = mongo.find_document('wtfcsv', doc_id)
    results = doc.get('results')

    if doc['sample_id'] == u'':
        remaining_days = mongo.get_remaining_days('wtfcsv', doc_id)
    else:
        remaining_days = None

    if 'bad_formatting' in results:
        return render_template('wtfcsv/results.html',
                               results=results,
                               tool_name='wtfcsv',
                               index=0)

    sheet_index = int(sheet_idx)
    columns = results[sheet_index]['columns']

    if len(columns) < 1:
        whatnext = 'no_data'
    else:
        # Choose from explicit candidate lists instead of re-drawing until a
        # pick differs: same uniform choice, but it always terminates, even
        # when columns compare equal to each other.
        with_freq_values = [c for c in columns if 'most_freq_values' in c]
        random_column = random.choice(with_freq_values or columns)

        others = [c for c in columns if c != random_column]
        random_column2 = random.choice(others) if others else random_column

        remaining = [c for c in others if c != random_column2]
        random_column3 = random.choice(
            remaining) if remaining else random_column

        top_values = random_column.get('most_freq_values') or []
        whatnext = {
            # 0 when the column has no frequency data to report
            'random_column_top_value':
            top_values[0]['value'] if top_values else 0,
            'random_column_name': random_column['name'],
            'random_column_name2': random_column2['name'],
            'random_column_name3': random_column3['name'],
        }

    # build a list of summary result data for the chart
    for col in columns:
        is_string = 'text' in col['display_type_name']
        data_to_use = []
        # pick the right results to summarize
        if 'deciles' in col:
            data_to_use = col['deciles']
        elif 'most_freq_values' in col:
            data_to_use = col['most_freq_values']
        elif 'word_counts' in col:
            data_to_use = [{
                'value': word[0],
                'count': word[1]
            } for word in col['word_counts']['unique_words'][:20]]
        # stitch together the overview
        overview_data = {'categories': [], 'values': []}
        for d in data_to_use:
            # non-text values have '_' turned back into '.'
            key = str(d['value']) if is_string else str(d['value']).replace(
                '_', '.')
            overview_data['categories'].append(key)
            overview_data['values'].append(d['count'])
        if 'others' in col:
            overview_data['categories'].append(gettext('Other'))
            overview_data['values'].append(int(col['others']))
        col['overview'] = overview_data
    return render_template('wtfcsv/results.html',
                           results=results,
                           whatnext=whatnext,
                           tool_name='wtfcsv',
                           index=sheet_index,
                           source=doc['source'],
                           remaining_days=remaining_days)
Example #18
0
def results_for_doc(doc_id):
    """
    Render the wordcounter results page for ``doc_id``.

    Falls back to the 'no_results.html' page when the document cannot be
    loaded. Only the top 40 unique words, bigrams and trigrams are passed to
    the page. The "what next" hints hold a randomly chosen less popular word
    with its count, and a top word together with the number of listed
    bigrams and trigrams that contain it.
    """
    results = {}
    remaining_days = None

    try:
        doc = mongo.find_document('wordcounter', doc_id)
        if doc['sample_id'] == '':
            remaining_days = mongo.get_remaining_days('wordcounter', doc_id)
    except Exception:
        logger.warning("Unable to find doc '%s'", doc_id)
        return render_template('no_results.html', tool_name='wordcounter')

    counts = doc.get('counts')

    # only render the top 40 results on the page (the csv contains all results)
    results['unique_words'] = counts['unique_words'][:40]
    results['bigrams'] = counts['bigrams'][:40]
    results['trigrams'] = counts['trigrams'][:40]

    unique_words = results['unique_words']

    def phrases_containing(word, phrases):
        # number of listed n-grams whose first element contains the word
        return sum(1 for phrase in phrases if word in phrase[0])

    random_unpopular_word = ['', '']
    top_word = ''
    word_in_bigrams_count = 0
    word_in_trigrams_count = 0

    if unique_words:
        if len(unique_words) > 1:
            max_index = min(20, len(unique_words))
            min_index = max(0, max_index - 5)
            # randrange's stop is exclusive; clamp it to the list length so
            # a list of 20 or fewer words can never be indexed past its end
            stop = min(max_index + 1, len(unique_words))
            random_unpopular_word = unique_words[random.randrange(
                min_index, stop)]
        else:
            random_unpopular_word = unique_words[0]

        # Find the most popular word that is also present in bigrams and
        # trigrams. If none can be found, just get the most popular word.
        top_word = unique_words[0][0]
        word_in_bigrams_count = phrases_containing(top_word,
                                                   results['bigrams'])
        word_in_trigrams_count = phrases_containing(top_word,
                                                    results['trigrams'])
        if not (word_in_bigrams_count > 0 and word_in_trigrams_count > 0):
            for entry in unique_words[1:]:
                in_bigrams = phrases_containing(entry[0], results['bigrams'])
                in_trigrams = phrases_containing(entry[0],
                                                 results['trigrams'])
                if in_bigrams > 0 and in_trigrams > 0:
                    top_word = entry[0]
                    word_in_bigrams_count = in_bigrams
                    word_in_trigrams_count = in_trigrams
                    break

    whatnext = {}
    whatnext['top_word'] = top_word
    whatnext['word_in_bigrams_count'] = word_in_bigrams_count
    whatnext['word_in_trigrams_count'] = word_in_trigrams_count
    whatnext['random_unpopular_word'] = random_unpopular_word[0]
    whatnext['random_unpopular_word_count'] = random_unpopular_word[1]
    biography = doc['biography'] if 'biography' in doc else None

    return render_template('wordcounter/results.html',
                           results=results,
                           whatnext=whatnext,
                           tool_name='wordcounter',
                           title=doc['title'],
                           doc_id=doc_id,
                           source=doc['source'],
                           remaining_days=remaining_days,
                           total_words=counts['total_word_count'],
                           biography=biography)
Example #19
0
def render_results(doc_id, sheet_idx):
    """
    Render the wtfcsv results page for one sheet of a document.

    ``sheet_idx`` selects the sheet inside the stored results (it is passed
    through ``int()``). Results flagged with 'bad_formatting' are rendered
    immediately without any further processing.

    For a sheet with columns, up to three distinct random columns are picked
    for the "what next" hints (the first one preferring a column that has
    'most_freq_values'), and every column gets an 'overview' entry holding
    the categories/values used by the chart. A sheet without columns gets
    the string 'no_data' as its ``whatnext`` value.
    """
    doc = mongo.find_document('wtfcsv', doc_id)
    results = doc.get('results')

    if doc['sample_id'] == u'':
        remaining_days = mongo.get_remaining_days('wtfcsv', doc_id)
    else:
        remaining_days = None

    if 'bad_formatting' in results:
        return render_template('wtfcsv/results.html', results=results, tool_name='wtfcsv', index=0)

    sheet_index = int(sheet_idx)
    columns = results[sheet_index]['columns']

    if len(columns) < 1:
        whatnext = 'no_data'
    else:
        # Choose from explicit candidate lists instead of re-drawing until a
        # pick differs: same uniform choice, but it always terminates, even
        # when columns compare equal to each other.
        with_freq_values = [c for c in columns if 'most_freq_values' in c]
        random_column = random.choice(with_freq_values or columns)

        others = [c for c in columns if c != random_column]
        random_column2 = random.choice(others) if others else random_column

        remaining = [c for c in others if c != random_column2]
        random_column3 = random.choice(remaining) if remaining else random_column

        top_values = random_column.get('most_freq_values') or []
        whatnext = {
            # 0 when the column has no frequency data to report
            'random_column_top_value': top_values[0]['value'] if top_values else 0,
            'random_column_name': random_column['name'],
            'random_column_name2': random_column2['name'],
            'random_column_name3': random_column3['name'],
        }

    # build a list of summary result data for the chart
    for col in columns:
        is_string = 'text' in col['display_type_name']
        data_to_use = []
        # pick the right results to summarize
        if 'deciles' in col:
            data_to_use = col['deciles']
        elif 'most_freq_values' in col:
            data_to_use = col['most_freq_values']
        elif 'word_counts' in col:
            # (a leftover per-word debug print was removed here; it wrote to
            # stdout on every request and could fail on non-ASCII words)
            data_to_use = [{'value': word[0], 'count': word[1]} for word in col['word_counts']['unique_words'][:20]]
        # stitch together the overview
        overview_data = {'categories': [], 'values': []}
        for d in data_to_use:
            # non-text values have '_' turned back into '.'
            key = str(d['value']) if is_string else str(d['value']).replace('_', '.')
            overview_data['categories'].append(key)
            overview_data['values'].append(d['count'])
        if 'others' in col:
            overview_data['categories'].append(gettext('Other'))
            overview_data['values'].append(int(col['others']))
        col['overview'] = overview_data
    return render_template('wtfcsv/results.html',
        results=results,
        whatnext=whatnext,
        tool_name='wtfcsv',
        index=sheet_index,
        source=doc['source'],
        remaining_days=remaining_days)
Example #20
0
def results(doc_id):
    """
    Render the wordcounter results page for ``doc_id``.

    Falls back to the 'no_results.html' page when the document cannot be
    loaded. Only the top 40 unique words, bigrams and trigrams are passed to
    the page. The "what next" hints hold a randomly chosen less popular word
    with its count, and a top word together with the number of listed
    bigrams and trigrams that contain it.
    """
    results = {}
    remaining_days = None

    try:
        doc = mongo.find_document('wordcounter', doc_id)
        if doc['sample_id'] == u'':
            remaining_days = mongo.get_remaining_days('wordcounter', doc_id)
    except Exception:
        logger.warning("Unable to find doc '%s'", doc_id)
        return render_template('no_results.html', tool_name='wordcounter')

    counts = doc.get('counts')

    # only render the top 40 results on the page (the csv contains all results)
    results['unique_words'] = counts['unique_words'][:40]
    results['bigrams'] = counts['bigrams'][:40]
    results['trigrams'] = counts['trigrams'][:40]

    unique_words = results['unique_words']

    def phrases_containing(word, phrases):
        # number of listed n-grams whose first element contains the word
        return sum(1 for phrase in phrases if word in phrase[0])

    random_unpopular_word = ['', '']
    top_word = ''
    word_in_bigrams_count = 0
    word_in_trigrams_count = 0

    if unique_words:
        if len(unique_words) > 1:
            max_index = min(20, len(unique_words))
            min_index = max(0, max_index - 5)
            # randrange's stop is exclusive; clamp it to the list length so
            # a list of 20 or fewer words can never be indexed past its end
            stop = min(max_index + 1, len(unique_words))
            random_unpopular_word = unique_words[random.randrange(min_index, stop)]
        else:
            random_unpopular_word = unique_words[0]

        # Find the most popular word that is also present in bigrams and
        # trigrams. If none can be found, just get the most popular word.
        top_word = unique_words[0][0]
        word_in_bigrams_count = phrases_containing(top_word, results['bigrams'])
        word_in_trigrams_count = phrases_containing(top_word, results['trigrams'])
        if not (word_in_bigrams_count > 0 and word_in_trigrams_count > 0):
            for entry in unique_words[1:]:
                in_bigrams = phrases_containing(entry[0], results['bigrams'])
                in_trigrams = phrases_containing(entry[0], results['trigrams'])
                if in_bigrams > 0 and in_trigrams > 0:
                    top_word = entry[0]
                    word_in_bigrams_count = in_bigrams
                    word_in_trigrams_count = in_trigrams
                    break

    whatnext = {}
    whatnext['top_word'] = top_word
    whatnext['word_in_bigrams_count'] = word_in_bigrams_count
    whatnext['word_in_trigrams_count'] = word_in_trigrams_count
    whatnext['random_unpopular_word'] = random_unpopular_word[0]
    whatnext['random_unpopular_word_count'] = random_unpopular_word[1]

    return render_template('wordcounter/results.html',
        results=results,
        whatnext=whatnext,
        tool_name='wordcounter',
        title=doc['title'],
        doc_id=doc_id,
        source=doc['source'],
        remaining_days=remaining_days,
        total_words=counts['total_word_count'])