def download(doc_id):
    try:
        logger.debug("Download %s", doc_id)
        doc = mongo.find_document('samediff', doc_id)
        headers = [_('word'),
                   _('uses in') + ' ' + doc['filenames'][0],
                   _('uses in') + ' ' + doc['filenames'][1],
                   _('total uses')]
        rows = []
        # words that appear in both documents
        for f, w in doc['sameWords']:
            doc1_count = next(f2 for f2, w2 in doc['mostFrequentDoc1'] if w == w2)
            doc2_count = next(f2 for f2, w2 in doc['mostFrequentDoc2'] if w == w2)
            rows.append([w, doc1_count, doc2_count, f])
        # words that appear only in the first document
        for f, w in doc['diffWordsDoc1']:
            rows.append([w, f, 0, f])
        # words that appear only in the second document
        for f, w in doc['diffWordsDoc2']:
            rows.append([w, 0, f, f])
        # TODO: clean up file name
        file_path = filehandler.write_to_csv(
            headers, rows,
            filehandler.generate_filename('csv', '', doc['filenames'][0], doc['filenames'][1]),
            False)
        logger.debug('  created csv to download at %s', file_path)
        return filehandler.generate_csv(file_path)
    except Exception as e:
        logger.exception(e)
        abort(400)

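# The two next(...) scans in download() above walk mostFrequentDoc1/2 once per
# shared word, which is quadratic in the word-list sizes. A minimal alternative
# sketch using a dict for O(1) lookups; it assumes the same (count, word) pair
# layout, and _counts_by_word is an illustrative helper, not part of this app:
def _counts_by_word(pairs):
    """Map word -> count for a list of (count, word) pairs."""
    return {w: f for f, w in pairs}

# usage sketch with toy data:
assert _counts_by_word([(5, 'share'), (3, 'data')]).get('share', 0) == 5
assert _counts_by_word([(5, 'share'), (3, 'data')]).get('story', 0) == 0
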
def download_gexf(doc_id):
    """Download GEXF file"""
    logger.info('[CTD] Requesting GEXF for doc: %s', doc_id)
    doc = mongo.find_document('connectthedots', doc_id)
    return Response(doc.get('results')['gexf'], mimetype='application/xml')

def results(doc_id):
    remaining_days = None
    try:
        job = mongo.find_document('samediff', doc_id)
        if job['sample_id'] == u'':
            remaining_days = mongo.get_remaining_days('samediff', doc_id)
    except Exception:
        logger.warning("Unable to find doc '%s'", doc_id)
        return render_template('no_results.html', tool_name='samediff')

    whatnext = {}
    whatnext['most_common_word'] = job['sameWords'][0][1] if len(job['sameWords']) > 0 else ''
    whatnext['second_most_common_word'] = job['sameWords'][1][1] if len(job['sameWords']) > 1 else ''
    whatnext['doc2_most_common_word'] = job['diffWordsDoc2'][0][1] if len(job['diffWordsDoc2']) > 0 else ''

    # normalize the length difference by the longer document
    if job['totalWordsDoc1'] > job['totalWordsDoc2']:
        pct_length_diff = float(job['totalWordsDoc1'] - job['totalWordsDoc2']) / float(job['totalWordsDoc1'])
    else:
        pct_length_diff = float(job['totalWordsDoc2'] - job['totalWordsDoc1']) / float(job['totalWordsDoc2'])

    return render_template('samediff/results.html',
                           results=job,
                           pct_length_diff=pct_length_diff,
                           cosine_similarity={'score': job['cosineSimilarity'],
                                              'description': interpretCosineSimilarity(job['cosineSimilarity'])},
                           whatnext=whatnext,
                           tool_name='samediff',
                           doc_id=doc_id,
                           remaining_days=remaining_days)

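# A standalone sketch of the pct_length_diff arithmetic in results() above; the
# helper name is illustrative (the view computes it inline). Dividing by the
# longer document keeps the value in [0, 1): e.g. 1200 vs. 900 words gives
# (1200 - 900) / 1200 = 0.25.
def _pct_length_difference(total1, total2):
    """Fraction by which the longer document exceeds the shorter one."""
    longer, shorter = max(total1, total2), min(total1, total2)
    return float(longer - shorter) / float(longer)

assert _pct_length_difference(1200, 900) == 0.25
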
def results(doc_id):
    """Lookup results for a given document"""
    try:
        # look the document up first so a missing id falls through to no_results
        mongo.find_document('connectthedots', doc_id)
        logger.info('[CTD] Showing results for doc: %s', doc_id)
        return render_results(doc_id)
    except Exception as e:
        logger.warning('[CTD] Unable to find doc: %s', doc_id)
        logger.warning('[CTD] Error: %s', str(e))
        return render_template('no_results.html', tool_name='connectthedots')

def download_table(doc_id):
    """Download CSV of degree/centrality scores"""
    logger.info('[CTD] Requesting CSV of table for doc: %s', doc_id)
    doc = mongo.find_document('connectthedots', doc_id)

    def as_csv(rows, headers):
        yield ','.join(headers) + '\n'
        for r in rows:
            yield ','.join(map(str, [r['id'], r['degree'], r['centrality'], r['community']])) + '\n'

    return Response(as_csv(doc.get('results')['table'],
                           ['node', 'degree', 'betweenness centrality', 'community']),
                    mimetype='text/csv')

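# The as_csv generator in download_table() joins fields with ',' directly, so a
# node id containing a comma or quote would corrupt its row. A hedged
# alternative sketch using the stdlib csv module for proper quoting (_csv_row
# is an illustrative helper, not part of this app; Python 3 io.StringIO shown):
import csv
import io

def _csv_row(fields):
    """Render one properly escaped CSV row as a string."""
    buf = io.StringIO()
    csv.writer(buf).writerow(fields)
    return buf.getvalue()

assert _csv_row(['Smith, Jane', 3]) == '"Smith, Jane",3\r\n'
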
def render_results(doc_id):
    """Render results page"""
    doc = mongo.find_document('connectthedots', doc_id)
    results = doc.get('results')

    if doc.get('source') != 'sample':
        remaining_days = mongo.get_remaining_days('connectthedots', doc_id)
    else:
        remaining_days = None

    # get first centrality/degree mismatch
    first_mismatch = None
    degree_index = 0
    centrality_index = 0
    table_by_degree = sorted(results['table'], key=operator.itemgetter('degree'), reverse=True)
    table_by_centrality = results['table']

    for i, row in enumerate(table_by_degree):
        if row['id'] != table_by_centrality[i]['id']:
            first_mismatch = row['id']
            degree_index = i
            break

    if first_mismatch is not None:
        # start from where we left off
        for i, row in enumerate(table_by_centrality[degree_index + 1:]):
            if row['id'] == first_mismatch:
                centrality_index = i + degree_index + 1
                break

    what_next = {
        'mismatch_id': first_mismatch,
        'mismatch_degree': ordinal(degree_index + 1),
        'mismatch_centrality': ordinal(centrality_index + 1),
        'lowest_degree': table_by_degree[-1]['id'],
    }

    biography = results['biography'] if 'biography' in results else None

    return render_template('connectthedots/results.html',
                           results=results,
                           whatnext=what_next,
                           tool_name='connectthedots',
                           source=doc['source'],
                           has_multiple_sheets=results['has_multiple_sheets'],
                           remaining_days=remaining_days,
                           biography=biography)

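# Toy illustration of the degree/centrality mismatch scan in render_results()
# above: with nodes ranked [A, B, C] by betweenness centrality but [A, C, B]
# by degree, the first rank where the two orderings disagree features node C.
# The data below is illustrative only:
_by_centrality = [{'id': 'A'}, {'id': 'B'}, {'id': 'C'}]
_by_degree = [{'id': 'A'}, {'id': 'C'}, {'id': 'B'}]
_first_mismatch = next(
    (row['id'] for i, row in enumerate(_by_degree)
     if row['id'] != _by_centrality[i]['id']),
    None)
assert _first_mismatch == 'C'
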
def download_csv(doc_id, analysis_type):
    logger.debug("Download %s", analysis_type)
    if analysis_type not in ['words', 'bigrams', 'trigrams']:
        logger.warning("Requested unknown csv type: %s", analysis_type)
        abort(400)
    try:
        doc = mongo.find_document('wordcounter', doc_id)
    except Exception:
        logger.warning("Unable to find doc '%s'", doc_id)
        abort(400)
    file_path = create_csv_file(doc.get('counts'), analysis_type)
    if file_path is None:
        abort(500)
    logger.debug('  created %s csv to download at %s', analysis_type, file_path)
    return filehandler.generate_csv(file_path)

def results_page(doc_id):
    try:
        results = mongo.find_document('wtfcsv', doc_id).get('results')
        if len(results) > 1:
            # doc has more than one sheet to analyze; default to the first
            logger.info("Showing results %s (sheet 0)", doc_id)
            submit = request.args.get('submit', '')
            param = '?submit=true' if 'true' in submit else ''
            return redirect(g.current_lang + '/wtfcsv/results/' + doc_id + '/sheets/0' + param)
        else:
            logger.info("Showing results %s", doc_id)
            return render_results(doc_id, 0)
    except Exception as e:
        logger.warning("Unable to find doc '%s'", doc_id)
        logger.exception(e)
        return render_template('no_results.html', tool_name='wtfcsv')

def render_results(doc_id, sheet_idx):
    doc = mongo.find_document('wtfcsv', doc_id)
    results = doc.get('results')

    if doc['sample_id'] == u'':
        remaining_days = mongo.get_remaining_days('wtfcsv', doc_id)
    else:
        remaining_days = None

    if 'bad_formatting' in results:
        return render_template('wtfcsv/results.html', results=results, tool_name='wtfcsv', index=0)

    def get_random_column():
        return random.choice(results[int(sheet_idx)]['columns'])

    columns = results[int(sheet_idx)]['columns']
    if len(columns) < 1:
        whatnext = 'no_data'
    else:
        random_column = get_random_column()
        random_column2 = get_random_column()
        random_column3 = get_random_column()
        # make sure the featured column has frequency data, if any column does
        if next((c for c in columns if 'most_freq_values' in c), None) is not None:
            while 'most_freq_values' not in random_column:
                random_column = get_random_column()
        if len(columns) > 1:
            while random_column2 == random_column:
                random_column2 = get_random_column()
        else:
            random_column2 = random_column
        if len(columns) > 2:
            while random_column3 == random_column or random_column3 == random_column2:
                random_column3 = get_random_column()
        else:
            random_column3 = random_column

        whatnext = {}
        if 'most_freq_values' in random_column and len(random_column['most_freq_values']) > 0:
            whatnext['random_column_top_value'] = random_column['most_freq_values'][0]['value']
        else:
            whatnext['random_column_top_value'] = 0
        whatnext['random_column_name'] = random_column['name']
        whatnext['random_column_name2'] = random_column2['name']
        whatnext['random_column_name3'] = random_column3['name']

    # build a list of summary result data for the chart
    for col in columns:
        is_string = 'text' in col['display_type_name']
        data_to_use = []
        # pick the right results to summarize
        if 'deciles' in col:
            data_to_use = col['deciles']
        elif 'most_freq_values' in col:
            data_to_use = col['most_freq_values']
        elif 'word_counts' in col:
            data_to_use = [{'value': word[0], 'count': word[1]}
                           for word in col['word_counts']['unique_words'][:20]]
        # stitch together the overview
        overview_data = {'categories': [], 'values': []}
        for d in data_to_use:
            key = str(d['value']) if is_string else str(d['value']).replace('_', '.')
            overview_data['categories'].append(key)
            overview_data['values'].append(d['count'])
        if 'others' in col:
            overview_data['categories'].append(gettext('Other'))
            overview_data['values'].append(int(col['others']))
        col['overview'] = overview_data

    return render_template('wtfcsv/results.html',
                           results=results,
                           whatnext=whatnext,
                           tool_name='wtfcsv',
                           index=int(sheet_idx),
                           source=doc['source'],
                           remaining_days=remaining_days)

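# Minimal sketch of the overview stitching at the end of render_results(),
# run on a toy text column (dict shapes mirror what the loop expects; the
# values are illustrative):
_col = {
    'display_type_name': 'text',
    'most_freq_values': [{'value': 'red', 'count': 7}, {'value': 'blue', 'count': 3}],
    'others': 2,
}
_overview = {'categories': [], 'values': []}
for _d in _col['most_freq_values']:
    _overview['categories'].append(str(_d['value']))
    _overview['values'].append(_d['count'])
_overview['categories'].append('Other')  # gettext('Other') in the real loop
_overview['values'].append(int(_col['others']))
assert _overview == {'categories': ['red', 'blue', 'Other'], 'values': [7, 3, 2]}
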
def results_for_doc(doc_id):
    results = {}
    remaining_days = None
    try:
        doc = mongo.find_document('wordcounter', doc_id)
        if doc['sample_id'] == '':
            remaining_days = mongo.get_remaining_days('wordcounter', doc_id)
    except Exception:
        logger.warning("Unable to find doc '%s'", doc_id)
        return render_template('no_results.html', tool_name='wordcounter')

    counts = doc.get('counts')

    # only render the top 40 results on the page (the csv contains all results)
    results['unique_words'] = counts['unique_words'][:40]
    results['bigrams'] = counts['bigrams'][:40]
    results['trigrams'] = counts['trigrams'][:40]

    max_index = min(20, len(results['unique_words']))
    min_index = max(0, max_index - 5)
    random_unpopular_word = ['', '']
    top_word = ''
    word_in_bigrams_count = 0
    word_in_trigrams_count = 0

    if len(results['unique_words']) > 0:
        # pick a word from the lower end of the top-20 list (randrange's stop
        # bound is exclusive, so this stays within the list)
        random_unpopular_word = results['unique_words'][random.randrange(min_index, max_index)] \
            if len(results['unique_words']) > 1 else results['unique_words'][0]

        # Find the most popular word that is also present in bigrams and
        # trigrams. If none can be found, just use the most popular word.
        if results['unique_words'] and results['bigrams'] and results['trigrams']:
            for word in results['unique_words']:
                top_word = word[0]
                word_in_bigrams_count = 0
                word_in_trigrams_count = 0
                for b in results['bigrams']:
                    if top_word in b[0]:
                        word_in_bigrams_count += 1
                for t in results['trigrams']:
                    if top_word in t[0]:
                        word_in_trigrams_count += 1
                if word_in_bigrams_count > 0 and word_in_trigrams_count > 0:
                    break
            if word_in_bigrams_count == 0 and word_in_trigrams_count == 0:
                top_word = results['unique_words'][0][0]

    whatnext = {}
    whatnext['top_word'] = top_word
    whatnext['word_in_bigrams_count'] = word_in_bigrams_count
    whatnext['word_in_trigrams_count'] = word_in_trigrams_count
    whatnext['random_unpopular_word'] = random_unpopular_word[0]
    whatnext['random_unpopular_word_count'] = random_unpopular_word[1]

    biography = doc['biography'] if 'biography' in doc else None

    return render_template('wordcounter/results.html',
                           results=results,
                           whatnext=whatnext,
                           tool_name='wordcounter',
                           title=doc['title'],
                           doc_id=doc_id,
                           source=doc['source'],
                           remaining_days=remaining_days,
                           total_words=counts['total_word_count'],
                           biography=biography)

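# Toy run of the top-word scan in results_for_doc() above: 'data' is the most
# frequent word that also appears in at least one bigram and one trigram, so
# it is chosen. The counts below are illustrative, and the n-grams are shown
# as strings here (membership via `in` also works on word tuples):
_unique_words = [('data', 10), ('basic', 8)]
_bigrams = [('open data', 4)]
_trigrams = [('open data day', 2)]
_top_word = next(
    (w for w, _n in _unique_words
     if any(w in b for b, _c in _bigrams) and any(w in t for t, _c in _trigrams)),
    _unique_words[0][0])
assert _top_word == 'data'
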