def tag_pairs_page():
    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    if not newsbreaker_initialized:
        newsbreaker.init(
            os.path.join(metadata_folder, 'topic_model'),
            'topic_model.pkl',
            'vocab.txt'
        )
        globals()['newsbreaker_initialized'] = True

    user = request.cookies['user']

    if request.method == 'GET':
        if 'stats' in request.args:
            n = tests_db['pairs'].find().count()
            return json.dumps({'user': user, 'count': n}, indent=2)

        try:
            basefeed = request.args.get('feed') or choice(feeds).name
            baseindex = int(
                request.args.get('index') or choice([
                    entry for entry in entries_metadata[basefeed]
                    if entry.data.get('newsbreaker')
                    and entry.data.get('politics')
                ]).index
            )
        except KeyError as e:
            return 'Invalid feed: %s' % e.args, 400
        except ValueError as e:
            return 'Invalid index: %s' % e.args, 400

        for base_metadata in entries_metadata[basefeed]:
            if base_metadata.index == baseindex:
                break
        else:
            return '%s|%d not found' % (basefeed, baseindex), 400

        if not base_metadata.data.get('newsbreaker', False):
            return "%s|%d isn't a filtered entry" % (basefeed, baseindex), 400

        selected_date = base_metadata.data['date']

        # Get all entries on the same date as base that are breakable,
        # and load them as BreakableEntries
        day_entries = load_entries(
            metadata_folder,
            it=(
                (
                    feed,
                    (
                        entry for entry in entries_metadata[feed.name]
                        if entry.data.get('date') == selected_date
                        and entry.data.get('newsbreaker', False)
                        and entry.data.get('politics')
                    )
                )
                for feed in feeds
            )
        )

        # Retrieve base as a BreakableEntry
        for base in day_entries:
            if base.feedname == basefeed and base.index == baseindex:
                break
        else:
            return "Base entry isn't breakable", 500

        # Get the WHAT distance of every entry to base
        day_entries = [
            (entry, entry.what_distance(base)) for entry in day_entries
        ]

        # Build all pairs of entries that are not base,
        # ordered by dist_e1 + dist_e2 ascending
        tests = [
            (e1, e2)
            for e1, e2, _ in sorted(
                (
                    (e1, e2, d1 + d2)
                    for i, (e1, d1) in enumerate(day_entries)
                    for j, (e2, d2) in enumerate(day_entries)
                    if i < j and e1 != base and e2 != base
                ),
                key=lambda t: t[2]
            )
        ]

        # Sample at most 'maxtests' pairs (default MAX_TESTS_PAIRS)
        try:
            max_tests = int(request.args.get('maxtests'))
        except (TypeError, ValueError):
            max_tests = MAX_TESTS_PAIRS

        tests = sample(tests, min(len(tests), max_tests))

        if not tests:
            # No more entries on this date
            return '%s|%d cannot be used for tagger' % (basefeed, baseindex), 400

        # Only filtered entries remain now
        return render_template(
            'tag_pairs.html',
            title='Pairs tagger',
            base={
                'feedname': base.feedname,
                'index': base.index,
                'title': base.title,
                'content': base.content,
            },
            tests=[
                [
                    {
                        'feedname': e1.feedname,
                        'index': e1.index,
                        'title': e1.title,
                        'content': e1.content,
                    },
                    {
                        'feedname': e2.feedname,
                        'index': e2.index,
                        'title': e2.title,
                        'content': e2.content,
                    }
                ]
                for e1, e2 in tests
            ],
            enumerate=enumerate,  # pass it to jinja
        )

    elif request.method == 'POST':
        if 'base' not in request.form:
            return 'Base not found in form inputs', 400

        tests = []
        for k, v in request.form.items():
            if regex_test.fullmatch(k):
                tests.append((k, str(v)))  # str it just in case

        response = redirect(
            url_for('get_base_for_pairs') + '?maxtests=%d' % len(tests)
        )

        if not tests:
            return response

        tests = [
            [
                request.form[testname + 'e1'],
                request.form[testname + 'e2'],
                int(testvalue == '1')  # 0 or 1
            ]
            for testname, testvalue in tests
            if testvalue in ('-1', '1')
        ]

        if not tests:
            return response  # don't insert into tests_db without any values

        tests_db['pairs'].insert_many(
            {
                'base': request.form['base'],
                'e1': test[0],
                'e2': test[1],
                'res': test[2],
                'user': user
            }
            for test in tests
        )

        return response
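# --- Minimal sketch of the module-level wiring tag_pairs_page() relies on. ---
# Everything below is an assumption reconstructed from the calls inside the
# view (names, default values, the form-key regex, the database name and the
# route/endpoint registration may differ in the real app); only the objects
# the view actually touches are stubbed out here.
import os
import json
import re
from random import choice, sample

from flask import Flask, request, redirect, url_for, render_template
from pymongo import MongoClient

import newsbreaker                      # assumed: provides init()
from newsbreaker import load_entries    # assumed import location

app = Flask(__name__)

metadata_folder = 'data'         # assumed folder holding metadata + topic_model/
MAX_TESTS_PAIRS = 5              # assumed default number of pairs per page
newsbreaker_initialized = False  # flipped to True on the first request

feeds = []                       # populated at startup (details not shown here)
entries_metadata = {}            # assumed mapping: feed name -> entry metadata list

# Assumed naming convention for the POST form: each judged pair posts a key
# like 'test3' with value '1'/'-1', plus 'test3e1' and 'test3e2' entry ids.
regex_test = re.compile(r'test\d+')

tests_db = MongoClient().tests   # assumed database name for tagger results

# Endpoint name taken from the redirect in the POST branch; in the real app
# 'get_base_for_pairs' may instead be a separate view function.
app.add_url_rule('/tag/pairs', 'get_base_for_pairs',
                 tag_pairs_page, methods=['GET', 'POST'])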
def get_entry(feedname, index):
    index = int(index)  # just in case
    for entry in entries:
        if entry.feedname == feedname and entry.index == index:
            return entry
    else:
        raise KeyError((feedname, index))


if __name__ == '__main__':
    viz = sys.argv[1]
    feedname = sys.argv[2]
    index = sys.argv[3]

    folder = 'data'

    init(os.path.join(folder, 'topic_model'), 'topic_model.pkl', 'vocab.txt')

    entries = load_entries(folder)

    # Filter entries: only politics
    entries = [entry for entry in entries if entry.data.get('politics')]

    from pymongo import MongoClient

    mongo_client = MongoClient()
    dists_db = mongo_client.distances
    # One collection per (viz, feedname, index) combination
    collection = '_'.join(sys.argv[1:4])
    col = getattr(dists_db, collection)

    base = get_entry(feedname, index)
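# Example invocation of this script (the script name and argument values are
# hypothetical; the argument meaning is taken from the argv parsing above):
#
#   $ python compute_distances.py what elpais 12
#
# This would load entries from the 'data' folder, keep only politics-tagged
# ones, connect to the local MongoDB database 'distances', select the
# collection 'what_elpais_12' (i.e. '_'.join(sys.argv[1:4])), and use entry
# elpais|12 as the base for whatever computation follows this excerpt.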