Ejemplo n.º 1
0
  def get(self):
    """Generate a word cloud for the records matching the query in
    ``request.args``.

    Sanitises the incoming query params, forwards them to the Solr
    term-vector (tvrh) endpoint, and feeds the tf/tf-idf response to
    ``word_cloud.generate_wordcloud``.

    Returns a ``(payload, status_code)`` tuple (Flask-RESTful style).
    """
    solr_args = dict(request.args)

    # min_percent_word / min_occurrences_word are consumed locally below,
    # so strip them before forwarding to Solr.
    # Bug fix: the original tested 'max_groups' but deleted
    # 'min_percent_word', raising KeyError when 'max_groups' was sent
    # without 'min_percent_word'.
    if 'min_percent_word' in solr_args:
        del solr_args['min_percent_word']
    if 'min_occurrences_word' in solr_args:
        del solr_args['min_occurrences_word']

    # Cap rows at the configured maximum; request.args values are lists,
    # hence the [0] on the default lookup.
    solr_args["rows"] = min(int(solr_args.get("rows", [current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS")])[0]), current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS"))
    solr_args['fields'] = ['id']
    solr_args['defType'] = 'aqp'
    # Term-vector component options: only tf and tf-idf are needed.
    solr_args['tv'] = 'true'
    solr_args['tv.tf_idf'] = 'true'
    solr_args['tv.tf'] = 'true'
    solr_args['tv.positions'] ='false'
    solr_args['tf.offsets'] = 'false'
    solr_args['tv.fl'] ='abstract,title'
    solr_args['fl'] ='id,abstract,title'
    solr_args['wt'] = 'json'

    # Forward the caller's token so the downstream service can authorize.
    headers = {'X-Forwarded-Authorization' : request.headers.get('Authorization')}

    response = client().get(current_app.config.get("VIS_SERVICE_TVRH_PATH") , params = solr_args, headers=headers)

    if response.status_code == 200:
        data = response.json()
    else:
        return {"Error": "There was a connection error. Please try again later", "Error Info": response.text}, response.status_code

    # Bug fix: word_cloud_json was referenced below even when `data` was
    # falsy, raising NameError; initialise it first.
    word_cloud_json = None
    if data:
        min_percent_word = request.args.get("min_percent_word", current_app.config.get("VIS_SERVICE_WC_MIN_PERCENT_WORD"))
        min_occurrences_word = request.args.get("min_occurrences_word", current_app.config.get("VIS_SERVICE_WC_MIN_OCCURRENCES_WORD"))

        word_cloud_json = word_cloud.generate_wordcloud(data, min_percent_word = min_percent_word, min_occurrences_word = min_occurrences_word)
    if word_cloud_json:
        return word_cloud_json, 200
    else:
        return {"Error": "Empty word cloud. Try changing your minimum word parameters or expanding your query."}, 200
Ejemplo n.º 2
0
    def post(self):
        """Generate a word cloud from a JSON POST body.

        The body is either a Solr-args dict, or contains a double-encoded
        'query' member holding the Solr args. The matched records'
        titles/abstracts are fetched from Solr and fed to
        ``word_cloud.generate_wordcloud``.

        Returns a ``(payload, status_code)`` tuple (Flask-RESTful style).
        """
        solr_args = request.json
        if not solr_args:
            return {'Error' : 'there was a problem with your request', 'Error Info': 'no data provided with request'}, 403

        # Strip locally-consumed params before forwarding to Solr.
        # Bug fix: the original tested 'max_groups' but deleted
        # 'min_percent_word', raising KeyError when 'max_groups' was sent
        # without 'min_percent_word'.
        if 'min_percent_word' in solr_args:
            del solr_args['min_percent_word']
        if 'min_occurrences_word' in solr_args:
            del solr_args['min_occurrences_word']

        # NOTE(review): this elif binds to the 'min_occurrences_word' check
        # above, so a double-encoded 'query' is only parsed when that key is
        # absent — confirm this precedence is intended.
        elif 'query' in request.json:
            try:
                # 'query' arrives json-encoded inside the json body (double
                # encoded), hence the explicit loads of the first element.
                solr_args = json.loads(request.json["query"][0])
            except Exception:
                return {'Error' : 'there was a problem with your request', 'Error Info': 'couldn\'t decode query, it should be json-encoded before being sent (so double encoded)'}, 403

        # Cap rows at the configured maximum; values may be lists, hence [0].
        solr_args["rows"] = min(int(solr_args.get("rows", [current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS")])[0]), current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS"))
        solr_args['fl'] ='abstract,title'
        solr_args['wt'] = 'json'

        # Forward the caller's token so the downstream service can authorize.
        headers = {'X-Forwarded-Authorization' : request.headers.get('Authorization')}

        response = client().get(current_app.config.get("VIS_SERVICE_SOLR_PATH") , params = solr_args, headers=headers)

        if response.status_code == 200:
            data = response.json()
        else:
            return {"Error": "There was a connection error. Please try again later", "Error Info": response.text}, response.status_code

        # Bug fix: word_cloud_json was referenced below even when `data` was
        # falsy, raising NameError; initialise it first.
        word_cloud_json = None
        if data:
            # Truncate each title/abstract to the configured sizes and join
            # into one text per record (unicode: legacy Python 2 builtin).
            records = [unicode(". ".join(d.get('title', '')[:current_app.config.get("VIS_SERVICE_WC_MAX_TITLE_SIZE")]) + ". " + d.get('abstract', '')[:current_app.config.get("VIS_SERVICE_WC_MAX_ABSTRACT_SIZE")]) for d in data["response"]["docs"]]
            word_cloud_json = word_cloud.generate_wordcloud(records, n_most_common=current_app.config.get("VIS_SERVICE_WC_MAX_WORDS"), n_threads=current_app.config.get("VIS_SERVICE_WC_THREADS"), accepted_pos=(u'NN', u'NNP', u'NNS', u'NNPS', u'JJ', u'RB', u'VB', u'VBD', u'VBG', u'VBN', u'VBP', u'VBZ'))

        if word_cloud_json:
            return word_cloud_json, 200
        else:
            return {"Error": "Empty word cloud. Try changing your minimum word parameters or expanding your query."}, 200
Ejemplo n.º 3
0
  def test_word_cloud_resource(self):
    """Exercise the word-cloud helper pipeline end to end:
    ``add_punc_and_remove_redundancies`` -> ``build_dict`` ->
    ``combine_and_process_dicts``, then ``generate_wordcloud`` on a
    large fixture (``input_js_word_cloud``).
    """
    # Fixture dicts are large; show full diffs on failure.
    self.maxDiff = None

    # function: add_punc_and_remove_redundancies
    # uses the text returned from solr to do some cleaning up of the idf info returned by solr,
    # reducing counts of token components of slashed or dashed words
    # after this point the solr text is ignored, only the tf/idf data is used

    tf_idf_dict = {'word':{'tf' :[3], 'tf-idf' : [0.5]}, 'dashed' : {'tf' :[1], 'tf-idf' : [0.5]}, 'slashed' : {'tf' :[1], 'tf-idf' : [0.5]}, 'dashedword' : {'tf' :[1], 'tf-idf' : [0.5]}, 'slashedword' : {'tf' :[1], 'tf-idf' : [0.5]}}

    text_list = ['word', 'dashed-word', 'slashed/word']

    updated_info_dict = word_cloud.add_punc_and_remove_redundancies(tf_idf_dict, text_list)

    # 'dashed' and 'slashed' counts are decremented because they appear
    # only as components of 'dashed-word' / 'slashed/word' in the text.
    expected_outcome_info_dict = {'word':{'tf' :[1], 'tf-idf' : [0.5]}, 'dashed-word': {'tf' :[1], 'tf-idf' : [0.5]}, 'slashed/word' : {'tf' :[1], 'tf-idf' : [0.5]}, 'dashed' : {'tf' :[-1], 'tf-idf' : [0.5]}, 'slashed' : {'tf' :[0], 'tf-idf' : [0.5]}}

    self.assertEqual(updated_info_dict, expected_outcome_info_dict)

    # function: build_dict
    # is a parent function to add_punc_and_remove_redundancies that takes an tf idf info and text info
    # and returns a token and acronym dictionary. The token dictionary is grouped by stem and includes
    # a list of idf for each different word

    # Input is keyed record-id -> field -> token -> tf/tf-idf data.
    tf_idf_dict = {
      'fakeId': {
      'abstract': {
        'word': {
        'tf': [3],
        'tf-idf': [0.5]
        },
        'dashed': {
        'tf': [1],
        'tf-idf': [0.5]
        },
        'slashed': {
        'tf': [1],
        'tf-idf': [0.5]
        },
        'dashedword': {
        'tf': [1],
        'tf-idf': [0.5]
        },
        'slashedword': {
        'tf': [1],
        'tf-idf': [0.5]
        }
      },
      'title': {
        'research': {
        'tf': [1],
        'tf-idf': [0.1]
        },
        'researcher': {
        'tf': [1],
        'tf-idf': [0.9]
        },
        'acr::fake': {
        'tf': [1],
        'tf-idf': [0.5]
        }
      }
      }
    }

    text_list = [{'id': 'fakeId', 'abstract': 'word dashed-word slashed/word', 'title' : 'research researcher FAKE'}]

    # Expected: (token dict grouped by stem, acronym dict); 'research' and
    # 'researcher' share a stem, 'acr::fake' surfaces as acronym 'FAKE'.
    expected_outcome_info_dict = ({'dashedword': {'idf': [0.5], 'tokens': {'dashed-word': 1},  'record_count' : ['fakeId']},
    'research': {'idf': [0.9, 0.1], 'tokens': {'research': 1, 'researcher': 1},  'record_count' : ['fakeId', 'fakeId']},
    'slashedword': {'idf': [0.5], 'tokens': {'slashed/word': 1},  'record_count' : ['fakeId']},
    'word': {'idf': [0.5], 'tokens': {'word': 1}, 'record_count' : ['fakeId']}},
    {'FAKE': {'idf': [0.5], 'total_occurrences': 1, 'record_count' : ['fakeId']}})


    updated_info_dict = word_cloud.build_dict(tf_idf_dict, text_list)

    self.assertEqual(updated_info_dict, expected_outcome_info_dict)


    # function: combine_and_process_dicts
    # uses the expected outcome from the previous function
    # (merges the token and acronym dicts into one flat summary dict)

    combined_dict = word_cloud.combine_and_process_dicts(expected_outcome_info_dict[0], expected_outcome_info_dict[1])

    expected_combined_dict = {
    'dashed-word': {'idf': 0.5, 'total_occurrences' :1, 'record_count' :1 },
    'research' : {'idf': 0.5, 'total_occurrences' :2, 'record_count' :1 },
    'slashed/word':{'idf': 0.5, 'total_occurrences' :1, 'record_count' :1 },
    'word': {'idf': 0.5, 'total_occurrences' :1, 'record_count' :1 },
    'FAKE' : {'idf': 0.5, 'total_occurrences' :1, 'record_count' :1 }
    }

    self.assertEqual(combined_dict, expected_combined_dict)

    # testing the main word cloud generation function with large data;
    # the json round-trip normalises types for comparison with the fixture

    processed_data = word_cloud.generate_wordcloud(input_js_word_cloud, min_occurrences_word=2, min_percent_word=3)

    self.assertEqual(json.loads(json.dumps(processed_data)), test_js_word_cloud)

    # raising min_occurrences_word filters more words out of the cloud
    processed_data = word_cloud.generate_wordcloud(input_js_word_cloud, min_occurrences_word=5, min_percent_word=3)

    self.assertEqual(json.loads(json.dumps(processed_data)), test_json_word_cloud_min_occurrences)
Ejemplo n.º 4
0
    def test_word_cloud_resource(self):
        """Exercise the word-cloud helper pipeline end to end:
        ``add_punc_and_remove_redundancies`` -> ``build_dict`` ->
        ``combine_and_process_dicts``, then ``generate_wordcloud`` on a
        large fixture (``input_js_word_cloud``).
        """
        # Fixture dicts are large; show full diffs on failure.
        self.maxDiff = None

        # function: add_punc_and_remove_redundancies
        # uses the text returned from solr to do some cleaning up of the idf info returned by solr,
        # reducing counts of token components of slashed or dashed words
        # after this point the solr text is ignored, only the tf/idf data is used

        tf_idf_dict = {
            'word': {
                'tf': [3],
                'tf-idf': [0.5]
            },
            'dashed': {
                'tf': [1],
                'tf-idf': [0.5]
            },
            'slashed': {
                'tf': [1],
                'tf-idf': [0.5]
            },
            'dashedword': {
                'tf': [1],
                'tf-idf': [0.5]
            },
            'slashedword': {
                'tf': [1],
                'tf-idf': [0.5]
            }
        }

        text_list = ['word', 'dashed-word', 'slashed/word']

        updated_info_dict = word_cloud.add_punc_and_remove_redundancies(
            tf_idf_dict, text_list)

        # 'dashed' and 'slashed' counts are decremented because they appear
        # only as components of 'dashed-word' / 'slashed/word' in the text.
        expected_outcome_info_dict = {
            'word': {
                'tf': [1],
                'tf-idf': [0.5]
            },
            'dashed-word': {
                'tf': [1],
                'tf-idf': [0.5]
            },
            'slashed/word': {
                'tf': [1],
                'tf-idf': [0.5]
            },
            'dashed': {
                'tf': [-1],
                'tf-idf': [0.5]
            },
            'slashed': {
                'tf': [0],
                'tf-idf': [0.5]
            }
        }

        self.assertEqual(updated_info_dict, expected_outcome_info_dict)

        # function: build_dict
        # is a parent function to add_punc_and_remove_redundancies that takes an tf idf info and text info
        # and returns a token and acronym dictionary. The token dictionary is grouped by stem and includes
        # a list of idf for each different word

        # Input is keyed record-id -> field -> token -> tf/tf-idf data.
        tf_idf_dict = {
            'fakeId': {
                'abstract': {
                    'word': {
                        'tf': [3],
                        'tf-idf': [0.5]
                    },
                    'dashed': {
                        'tf': [1],
                        'tf-idf': [0.5]
                    },
                    'slashed': {
                        'tf': [1],
                        'tf-idf': [0.5]
                    },
                    'dashedword': {
                        'tf': [1],
                        'tf-idf': [0.5]
                    },
                    'slashedword': {
                        'tf': [1],
                        'tf-idf': [0.5]
                    }
                },
                'title': {
                    'research': {
                        'tf': [1],
                        'tf-idf': [0.1]
                    },
                    'researcher': {
                        'tf': [1],
                        'tf-idf': [0.9]
                    },
                    'acr::fake': {
                        'tf': [1],
                        'tf-idf': [0.5]
                    }
                }
            }
        }

        text_list = [{
            'id': 'fakeId',
            'abstract': 'word dashed-word slashed/word',
            'title': 'research researcher FAKE'
        }]

        # Expected: (token dict grouped by stem, acronym dict); 'research'
        # and 'researcher' share a stem, 'acr::fake' surfaces as 'FAKE'.
        expected_outcome_info_dict = ({
            'dashedword': {
                'idf': [0.5],
                'tokens': {
                    'dashed-word': 1
                },
                'record_count': ['fakeId']
            },
            'research': {
                'idf': [0.9, 0.1],
                'tokens': {
                    'research': 1,
                    'researcher': 1
                },
                'record_count': ['fakeId', 'fakeId']
            },
            'slashedword': {
                'idf': [0.5],
                'tokens': {
                    'slashed/word': 1
                },
                'record_count': ['fakeId']
            },
            'word': {
                'idf': [0.5],
                'tokens': {
                    'word': 1
                },
                'record_count': ['fakeId']
            }
        }, {
            'FAKE': {
                'idf': [0.5],
                'total_occurrences': 1,
                'record_count': ['fakeId']
            }
        })

        updated_info_dict = word_cloud.build_dict(tf_idf_dict, text_list)

        self.assertEqual(updated_info_dict, expected_outcome_info_dict)

        # function: combine_and_process_dicts
        # uses the expected outcome from the previous function
        # (merges the token and acronym dicts into one flat summary dict)

        combined_dict = word_cloud.combine_and_process_dicts(
            expected_outcome_info_dict[0], expected_outcome_info_dict[1])

        expected_combined_dict = {
            'dashed-word': {
                'idf': 0.5,
                'total_occurrences': 1,
                'record_count': 1
            },
            'research': {
                'idf': 0.5,
                'total_occurrences': 2,
                'record_count': 1
            },
            'slashed/word': {
                'idf': 0.5,
                'total_occurrences': 1,
                'record_count': 1
            },
            'word': {
                'idf': 0.5,
                'total_occurrences': 1,
                'record_count': 1
            },
            'FAKE': {
                'idf': 0.5,
                'total_occurrences': 1,
                'record_count': 1
            }
        }

        self.assertEqual(combined_dict, expected_combined_dict)

        # testing the main word cloud generation function with large data;
        # the json round-trip normalises types for fixture comparison

        processed_data = word_cloud.generate_wordcloud(input_js_word_cloud,
                                                       min_occurrences_word=2,
                                                       min_percent_word=3)

        self.assertEqual(json.loads(json.dumps(processed_data)),
                         test_js_word_cloud)

        # raising min_occurrences_word filters more words out of the cloud
        processed_data = word_cloud.generate_wordcloud(input_js_word_cloud,
                                                       min_occurrences_word=5,
                                                       min_percent_word=3)

        self.assertEqual(json.loads(json.dumps(processed_data)),
                         test_json_word_cloud_min_occurrences)
Ejemplo n.º 5
0
    def post(self):
        """Generate a word cloud from a JSON POST body via the Solr
        term-vector (tvrh) endpoint.

        The body is either a Solr-args dict, or contains a double-encoded
        'query' member holding the Solr args. The tf/tf-idf response is
        fed to ``word_cloud.generate_wordcloud``.

        Returns a ``(payload, status_code)`` tuple (Flask-RESTful style).
        """
        solr_args = request.json
        if not solr_args:
            return {
                'Error': 'there was a problem with your request',
                'Error Info': 'no data provided with request'
            }, 403

        # Strip locally-consumed params before forwarding to Solr.
        # Bug fix: the original tested 'max_groups' but deleted
        # 'min_percent_word', raising KeyError when 'max_groups' was sent
        # without 'min_percent_word'.
        if 'min_percent_word' in solr_args:
            del solr_args['min_percent_word']
        if 'min_occurrences_word' in solr_args:
            del solr_args['min_occurrences_word']

        # NOTE(review): this elif binds to the 'min_occurrences_word' check
        # above, so a double-encoded 'query' is only parsed when that key is
        # absent — confirm this precedence is intended.
        elif 'query' in request.json:
            try:
                # 'query' arrives json-encoded inside the json body (double
                # encoded), hence the explicit loads of the first element.
                solr_args = json.loads(request.json["query"][0])
            except Exception:
                return {
                    'Error':
                    'there was a problem with your request',
                    'Error Info':
                    'couldn\'t decode query, it should be json-encoded before being sent (so double encoded)'
                }, 403

        # Cap rows at the configured maximum; values may be lists, hence [0].
        solr_args["rows"] = min(
            int(
                solr_args.get(
                    "rows",
                    [current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS")
                     ])[0]),
            current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS"))
        solr_args['fields'] = ['id']
        solr_args['defType'] = 'aqp'
        # Term-vector component options: only tf and tf-idf are needed.
        solr_args['tv'] = 'true'
        solr_args['tv.tf_idf'] = 'true'
        solr_args['tv.tf'] = 'true'
        solr_args['tv.positions'] = 'false'
        solr_args['tf.offsets'] = 'false'
        solr_args['tv.fl'] = 'abstract,title'
        solr_args['fl'] = 'id,abstract,title'
        solr_args['wt'] = 'json'

        # Forward the caller's token so the downstream service can authorize.
        headers = {
            'X-Forwarded-Authorization': request.headers.get('Authorization')
        }

        response = client().get(
            current_app.config.get("VIS_SERVICE_TVRH_PATH"),
            params=solr_args,
            headers=headers)

        if response.status_code == 200:
            data = response.json()
        else:
            return {
                "Error":
                "There was a connection error. Please try again later",
                "Error Info": response.text
            }, response.status_code

        # Bug fix: word_cloud_json was referenced below even when `data` was
        # falsy, raising NameError; initialise it first.
        word_cloud_json = None
        if data:
            min_percent_word = request.args.get(
                "min_percent_word",
                current_app.config.get("VIS_SERVICE_WC_MIN_PERCENT_WORD"))
            min_occurrences_word = request.args.get(
                "min_occurrences_word",
                current_app.config.get("VIS_SERVICE_WC_MIN_OCCURRENCES_WORD"))

            word_cloud_json = word_cloud.generate_wordcloud(
                data,
                min_percent_word=min_percent_word,
                min_occurrences_word=min_occurrences_word)
        if word_cloud_json:
            return word_cloud_json, 200
        else:
            return {
                "Error":
                "Empty word cloud. Try changing your minimum word parameters or expanding your query."
            }, 200
Ejemplo n.º 6
0
    def post(self):
        """Generate a word cloud from a JSON POST body via the Solr
        search endpoint.

        The body is either a Solr-args dict, or contains a double-encoded
        'query' member holding the Solr args. Titles/abstracts of the
        matched records are fed to ``word_cloud.generate_wordcloud``.

        Returns a ``(payload, status_code)`` tuple (Flask-RESTful style).
        """
        solr_args = request.json
        if not solr_args:
            return {
                'Error': 'there was a problem with your request',
                'Error Info': 'no data provided with request'
            }, 403

        # Strip locally-consumed params before forwarding to Solr.
        # Bug fix: the original tested 'max_groups' but deleted
        # 'min_percent_word', raising KeyError when 'max_groups' was sent
        # without 'min_percent_word'.
        if 'min_percent_word' in solr_args:
            del solr_args['min_percent_word']
        if 'min_occurrences_word' in solr_args:
            del solr_args['min_occurrences_word']

        # NOTE(review): this elif binds to the 'min_occurrences_word' check
        # above, so a double-encoded 'query' is only parsed when that key is
        # absent — confirm this precedence is intended.
        elif 'query' in request.json:
            try:
                # 'query' arrives json-encoded inside the json body (double
                # encoded), hence the explicit loads of the first element.
                solr_args = json.loads(request.json["query"][0])
            except Exception:
                return {
                    'Error':
                    'there was a problem with your request',
                    'Error Info':
                    'couldn\'t decode query, it should be json-encoded before being sent (so double encoded)'
                }, 403

        # Cap rows at the configured maximum; values may be lists, hence [0].
        solr_args["rows"] = min(
            int(
                solr_args.get(
                    "rows",
                    [current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS")
                     ])[0]),
            current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS"))
        solr_args['fl'] = 'abstract,title'
        solr_args['wt'] = 'json'

        # Prefer an already-forwarded token, falling back to the caller's
        # own Authorization header.
        headers = {
            'Authorization':
            request.headers.get('X-Forwarded-Authorization',
                                request.headers.get('Authorization', ''))
        }

        response = client().get(
            current_app.config.get("VIS_SERVICE_SOLR_PATH"),
            params=solr_args,
            headers=headers)

        if response.status_code == 200:
            data = response.json()
        else:
            return {
                "Error":
                "There was a connection error. Please try again later",
                "Error Info": response.text
            }, response.status_code

        # Bug fix: word_cloud_json was referenced below even when `data` was
        # falsy, raising NameError; initialise it first.
        word_cloud_json = None
        if data:
            # Truncate each title/abstract to the configured sizes and join
            # into one text per record (unicode: legacy Python 2 builtin).
            records = [
                unicode(". ".join(
                    d.get('title', '')[:current_app.config.
                                       get("VIS_SERVICE_WC_MAX_TITLE_SIZE")]) +
                        ". " + d.get('abstract', '')
                        [:current_app.config.
                         get("VIS_SERVICE_WC_MAX_ABSTRACT_SIZE")])
                for d in data["response"]["docs"]
            ]
            word_cloud_json = word_cloud.generate_wordcloud(
                records,
                n_most_common=current_app.config.get(
                    "VIS_SERVICE_WC_MAX_WORDS"),
                n_threads=current_app.config.get("VIS_SERVICE_WC_THREADS"),
                accepted_pos=(u'NN', u'NNP', u'NNS', u'NNPS', u'JJ', u'RB',
                              u'VB', u'VBD', u'VBG', u'VBN', u'VBP', u'VBZ'))

        if word_cloud_json:
            return word_cloud_json, 200
        else:
            return {
                "Error":
                "Empty word cloud. Try changing your minimum word parameters or expanding your query."
            }, 200
Ejemplo n.º 7
0
    def get(self):
        """Generate a word cloud for the records matching the query in
        ``request.args``.

        Sanitises the incoming query params, forwards them to the Solr
        term-vector (tvrh) endpoint, and feeds the tf/tf-idf response to
        ``word_cloud.generate_wordcloud``.

        Returns a ``(payload, status_code)`` tuple (Flask-RESTful style).
        """
        solr_args = dict(request.args)

        # min_percent_word / min_occurrences_word are consumed locally
        # below, so strip them before forwarding to Solr.
        # Bug fix: the original tested 'max_groups' but deleted
        # 'min_percent_word', raising KeyError when 'max_groups' was sent
        # without 'min_percent_word'.
        if 'min_percent_word' in solr_args:
            del solr_args['min_percent_word']
        if 'min_occurrences_word' in solr_args:
            del solr_args['min_occurrences_word']

        # Cap rows at the configured maximum; request.args values are
        # lists, hence the [0] on the default lookup.
        solr_args["rows"] = min(
            int(
                solr_args.get(
                    "rows",
                    [current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS")
                     ])[0]),
            current_app.config.get("VIS_SERVICE_WC_MAX_RECORDS"))
        solr_args['fields'] = ['id']
        solr_args['defType'] = 'aqp'
        # Term-vector component options: only tf and tf-idf are needed.
        solr_args['tv'] = 'true'
        solr_args['tv.tf_idf'] = 'true'
        solr_args['tv.tf'] = 'true'
        solr_args['tv.positions'] = 'false'
        solr_args['tf.offsets'] = 'false'
        solr_args['tv.fl'] = 'abstract,title'
        solr_args['fl'] = 'id,abstract,title'
        solr_args['wt'] = 'json'

        # Forward the caller's token so the downstream service can authorize.
        headers = {
            'X-Forwarded-Authorization': request.headers.get('Authorization')
        }

        response = client().get(
            current_app.config.get("VIS_SERVICE_TVRH_PATH"),
            params=solr_args,
            headers=headers)

        if response.status_code == 200:
            data = response.json()
        else:
            return {
                "Error":
                "There was a connection error. Please try again later",
                "Error Info": response.text
            }, response.status_code

        # Bug fix: word_cloud_json was referenced below even when `data` was
        # falsy, raising NameError; initialise it first.
        word_cloud_json = None
        if data:
            min_percent_word = request.args.get(
                "min_percent_word",
                current_app.config.get("VIS_SERVICE_WC_MIN_PERCENT_WORD"))
            min_occurrences_word = request.args.get(
                "min_occurrences_word",
                current_app.config.get("VIS_SERVICE_WC_MIN_OCCURRENCES_WORD"))

            word_cloud_json = word_cloud.generate_wordcloud(
                data,
                min_percent_word=min_percent_word,
                min_occurrences_word=min_occurrences_word)
        if word_cloud_json:
            return word_cloud_json, 200
        else:
            return {
                "Error":
                "Empty word cloud. Try changing your minimum word parameters or expanding your query."
            }, 200