def test_word_cloud_resource(self):
    self.maxDiff = None

    # function: add_punc_and_remove_redundancies
    # uses the text returned from solr to clean up the tf/idf info returned
    # by solr, reducing the counts of tokens that are only components of
    # slashed or dashed words; after this point the solr text is ignored and
    # only the tf/idf data is used
    tf_idf_dict = {
        'word': {'tf': [3], 'tf-idf': [0.5]},
        'dashed': {'tf': [1], 'tf-idf': [0.5]},
        'slashed': {'tf': [1], 'tf-idf': [0.5]},
        'dashedword': {'tf': [1], 'tf-idf': [0.5]},
        'slashedword': {'tf': [1], 'tf-idf': [0.5]}
    }
    text_list = ['word', 'dashed-word', 'slashed/word']
    updated_info_dict = word_cloud.add_punc_and_remove_redundancies(
        tf_idf_dict, text_list)
    expected_outcome_info_dict = {
        'word': {'tf': [1], 'tf-idf': [0.5]},
        'dashed-word': {'tf': [1], 'tf-idf': [0.5]},
        'slashed/word': {'tf': [1], 'tf-idf': [0.5]},
        'dashed': {'tf': [-1], 'tf-idf': [0.5]},
        'slashed': {'tf': [0], 'tf-idf': [0.5]}
    }
    self.assertEqual(updated_info_dict, expected_outcome_info_dict)

    # function: build_dict
    # a parent function of add_punc_and_remove_redundancies: takes tf/idf
    # info and text info and returns a token dictionary and an acronym
    # dictionary. The token dictionary is grouped by stem and includes a
    # list of idf values, one for each distinct word
    tf_idf_dict = {
        'fakeId': {
            'abstract': {
                'word': {'tf': [3], 'tf-idf': [0.5]},
                'dashed': {'tf': [1], 'tf-idf': [0.5]},
                'slashed': {'tf': [1], 'tf-idf': [0.5]},
                'dashedword': {'tf': [1], 'tf-idf': [0.5]},
                'slashedword': {'tf': [1], 'tf-idf': [0.5]}
            },
            'title': {
                'research': {'tf': [1], 'tf-idf': [0.1]},
                'researcher': {'tf': [1], 'tf-idf': [0.9]},
                'acr::fake': {'tf': [1], 'tf-idf': [0.5]}
            }
        }
    }
    text_list = [{
        'id': 'fakeId',
        'abstract': 'word dashed-word slashed/word',
        'title': 'research researcher FAKE'
    }]
    expected_outcome_info_dict = ({
        'dashedword': {
            'idf': [0.5],
            'tokens': {'dashed-word': 1},
            'record_count': ['fakeId']
        },
        'research': {
            'idf': [0.9, 0.1],
            'tokens': {'research': 1, 'researcher': 1},
            'record_count': ['fakeId', 'fakeId']
        },
        'slashedword': {
            'idf': [0.5],
            'tokens': {'slashed/word': 1},
            'record_count': ['fakeId']
        },
        'word': {
            'idf': [0.5],
            'tokens': {'word': 1},
            'record_count': ['fakeId']
        }
    }, {
        'FAKE': {
            'idf': [0.5],
            'total_occurrences': 1,
            'record_count': ['fakeId']
        }
    })
    updated_info_dict = word_cloud.build_dict(tf_idf_dict, text_list)
    self.assertEqual(updated_info_dict, expected_outcome_info_dict)

    # function: combine_and_process_dicts
    # uses the expected outcome from the previous function
    combined_dict = word_cloud.combine_and_process_dicts(
        expected_outcome_info_dict[0], expected_outcome_info_dict[1])
    expected_combined_dict = {
        'dashed-word': {'idf': 0.5, 'total_occurrences': 1, 'record_count': 1},
        'research': {'idf': 0.5, 'total_occurrences': 2, 'record_count': 1},
        'slashed/word': {'idf': 0.5, 'total_occurrences': 1, 'record_count': 1},
        'word': {'idf': 0.5, 'total_occurrences': 1, 'record_count': 1},
        'FAKE': {'idf': 0.5, 'total_occurrences': 1, 'record_count': 1}
    }
    self.assertEqual(combined_dict, expected_combined_dict)

    # testing the main word cloud generation function with large data
    processed_data = word_cloud.generate_wordcloud(
        input_js_word_cloud, min_occurrences_word=2, min_percent_word=3)
    self.assertEqual(
        json.loads(json.dumps(processed_data)), test_js_word_cloud)

    processed_data = word_cloud.generate_wordcloud(
        input_js_word_cloud, min_occurrences_word=5, min_percent_word=3)
    self.assertEqual(
        json.loads(json.dumps(processed_data)),
        test_json_word_cloud_min_occurrences)
def post(self):
    solr_args = request.json
    if not solr_args:
        return {
            'Error': 'there was a problem with your request',
            'Error Info': 'no data provided with request'
        }, 403

    # strip word-cloud-only parameters so they are not forwarded to solr
    if 'min_percent_word' in solr_args:
        del solr_args['min_percent_word']
    if 'min_occurrences_word' in solr_args:
        del solr_args['min_occurrences_word']
    elif 'query' in request.json:
        try:
            solr_args = json.loads(request.json["query"][0])
        except Exception:
            return {
                'Error': 'there was a problem with your request',
                'Error Info': 'couldn\'t decode query, it should be '
                              'json-encoded before being sent (so double '
                              'encoded)'
            }, 403

    # parameter values arrive as single-item lists; cap rows at the
    # configured maximum
    solr_args["rows"] = min(
        int(solr_args.get(
            "rows",
            [current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS")])[0]),
        current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS"))

    # term-vector request handler parameters: ask solr for tf and tf-idf
    # statistics for the abstract and title fields
    solr_args['fields'] = ['id']
    solr_args['defType'] = 'aqp'
    solr_args['tv'] = 'true'
    solr_args['tv.tf_idf'] = 'true'
    solr_args['tv.tf'] = 'true'
    solr_args['tv.positions'] = 'false'
    solr_args['tv.offsets'] = 'false'
    solr_args['tv.fl'] = 'abstract,title'
    solr_args['fl'] = 'id,abstract,title'
    solr_args['wt'] = 'json'

    headers = {
        'X-Forwarded-Authorization': request.headers.get('Authorization')
    }

    response = client().get(
        current_app.config.get("VIS_SERVICE_TVRH_PATH"),
        params=solr_args,
        headers=headers)

    if response.status_code == 200:
        data = response.json()
    else:
        return {
            "Error": "There was a connection error. Please try again later",
            "Error Info": response.text
        }, response.status_code

    if data:
        min_percent_word = request.args.get(
            "min_percent_word",
            current_app.config.get("VIS_SERVICE_WC_MIN_PERCENT_WORD"))
        min_occurrences_word = request.args.get(
            "min_occurrences_word",
            current_app.config.get("VIS_SERVICE_WC_MIN_OCCURRENCES_WORD"))

        word_cloud_json = word_cloud.generate_wordcloud(
            data,
            min_percent_word=min_percent_word,
            min_occurrences_word=min_occurrences_word)

        if word_cloud_json:
            return word_cloud_json, 200
        else:
            return {
                "Error": "Empty word cloud. Try changing your minimum word parameters or expanding your query."
            }, 200
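# Hypothetical client call for the POST endpoint above, shown only to
# illustrate the "double encoded" query requirement named in its error
# message: the solr query dict is JSON-encoded into a string first, then
# placed in a single-item list under "query" in the JSON body, because the
# handler runs json.loads(request.json["query"][0]). The host and route here
# are illustrative, not taken from this service's URL map.
import json
import requests

solr_query = {'q': 'star formation', 'rows': ['50']}
payload = {'query': [json.dumps(solr_query)]}    # inner (first) encoding
resp = requests.post(
    'https://example.org/vis/word-cloud',        # hypothetical URL
    json=payload,                                # outer encoding via requests
    headers={'Authorization': 'Bearer <token>'})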
def post(self):
    solr_args = request.json
    if not solr_args:
        return {
            'Error': 'there was a problem with your request',
            'Error Info': 'no data provided with request'
        }, 403

    # strip word-cloud-only parameters so they are not forwarded to solr
    if 'min_percent_word' in solr_args:
        del solr_args['min_percent_word']
    if 'min_occurrences_word' in solr_args:
        del solr_args['min_occurrences_word']
    elif 'query' in request.json:
        try:
            solr_args = json.loads(request.json["query"][0])
        except Exception:
            return {
                'Error': 'there was a problem with your request',
                'Error Info': 'couldn\'t decode query, it should be '
                              'json-encoded before being sent (so double '
                              'encoded)'
            }, 403

    # parameter values arrive as single-item lists; cap rows at the
    # configured maximum
    solr_args["rows"] = min(
        int(solr_args.get(
            "rows",
            [current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS")])[0]),
        current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS"))
    solr_args['fl'] = 'abstract,title'
    solr_args['wt'] = 'json'

    # prefer an already-forwarded token, falling back to the incoming
    # Authorization header
    headers = {
        'Authorization': request.headers.get(
            'X-Forwarded-Authorization',
            request.headers.get('Authorization', ''))
    }

    response = client().get(
        current_app.config.get("VIS_SERVICE_SOLR_PATH"),
        params=solr_args,
        headers=headers)

    if response.status_code == 200:
        data = response.json()
    else:
        return {
            "Error": "There was a connection error. Please try again later",
            "Error Info": response.text
        }, response.status_code

    if data:
        # flatten each doc into "<joined titles>. <abstract>", truncating
        # both fields to their configured maximum sizes
        records = [
            unicode(
                ". ".join(d.get('title', '')[:current_app.config.get(
                    "VIS_SERVICE_WC_MAX_TITLE_SIZE")]) + ". " +
                d.get('abstract', '')[:current_app.config.get(
                    "VIS_SERVICE_WC_MAX_ABSTRACT_SIZE")])
            for d in data["response"]["docs"]
        ]

        word_cloud_json = word_cloud.generate_wordcloud(
            records,
            n_most_common=current_app.config.get("VIS_SERVICE_WC_MAX_WORDS"),
            n_threads=current_app.config.get("VIS_SERVICE_WC_THREADS"),
            accepted_pos=(u'NN', u'NNP', u'NNS', u'NNPS', u'JJ', u'RB',
                          u'VB', u'VBD', u'VBG', u'VBN', u'VBP', u'VBZ'))

        if word_cloud_json:
            return word_cloud_json, 200
        else:
            return {
                "Error": "Empty word cloud. Try changing your minimum word parameters or expanding your query."
            }, 200
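# A minimal sketch of the record flattening above, with made-up values:
# solr returns 'title' as a list of strings and 'abstract' as a plain
# string, and both are truncated to configured maximum sizes (the 1 and 500
# below stand in for VIS_SERVICE_WC_MAX_TITLE_SIZE and
# VIS_SERVICE_WC_MAX_ABSTRACT_SIZE) before concatenation.
doc = {'title': [u'A Title'], 'abstract': u'An abstract about star formation.'}
record = unicode(". ".join(doc.get('title', '')[:1]) + ". " +
                 doc.get('abstract', '')[:500])
# record == u'A Title. An abstract about star formation.'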
def get(self):
    solr_args = dict(request.args)

    # strip word-cloud-only parameters so they are not forwarded to solr
    if 'min_percent_word' in solr_args:
        del solr_args['min_percent_word']
    if 'min_occurrences_word' in solr_args:
        del solr_args['min_occurrences_word']

    # request.args values arrive as single-item lists; cap rows at the
    # configured maximum
    solr_args["rows"] = min(
        int(solr_args.get(
            "rows",
            [current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS")])[0]),
        current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS"))

    # term-vector request handler parameters: ask solr for tf and tf-idf
    # statistics for the abstract and title fields
    solr_args['fields'] = ['id']
    solr_args['defType'] = 'aqp'
    solr_args['tv'] = 'true'
    solr_args['tv.tf_idf'] = 'true'
    solr_args['tv.tf'] = 'true'
    solr_args['tv.positions'] = 'false'
    solr_args['tv.offsets'] = 'false'
    solr_args['tv.fl'] = 'abstract,title'
    solr_args['fl'] = 'id,abstract,title'
    solr_args['wt'] = 'json'

    headers = {
        'X-Forwarded-Authorization': request.headers.get('Authorization')
    }

    response = client().get(
        current_app.config.get("VIS_SERVICE_TVRH_PATH"),
        params=solr_args,
        headers=headers)

    if response.status_code == 200:
        data = response.json()
    else:
        return {
            "Error": "There was a connection error. Please try again later",
            "Error Info": response.text
        }, response.status_code

    if data:
        min_percent_word = request.args.get(
            "min_percent_word",
            current_app.config.get("VIS_SERVICE_WC_MIN_PERCENT_WORD"))
        min_occurrences_word = request.args.get(
            "min_occurrences_word",
            current_app.config.get("VIS_SERVICE_WC_MIN_OCCURRENCES_WORD"))

        word_cloud_json = word_cloud.generate_wordcloud(
            data,
            min_percent_word=min_percent_word,
            min_occurrences_word=min_occurrences_word)

        if word_cloud_json:
            return word_cloud_json, 200
        else:
            return {
                "Error": "Empty word cloud. Try changing your minimum word parameters or expanding your query."
            }, 200
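# Hypothetical client-side usage of the GET endpoint above (host and route
# are illustrative, not taken from this service's URL map). The handler
# forwards any solr parameters it receives, so q and rows can be passed as
# query-string arguments alongside the word-cloud thresholds, which it reads
# back from request.args.
import requests

resp = requests.get(
    'https://example.org/vis/word-cloud',        # hypothetical URL
    params={'q': 'dark energy', 'rows': 50,
            'min_percent_word': 3, 'min_occurrences_word': 2},
    headers={'Authorization': 'Bearer <token>'})
if resp.status_code == 200:
    word_cloud_json = resp.json()                # data for rendering the cloud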