Example #1
0
def make_map(query, only_terms=False, file_format='svg',
             include_svg_dimensions=False, starting_year=None,
             ending_year=None, sample_size=None, evaluation_output_path=None,
             term_type=TermExtraction.Phrases, data_dump_path=None,
             n_layers=0, graph_attrs=None, **kwargs):
    """Build a map for the documents matching ``query``.

    The documents are selected with ``filter_query``, terms are pulled out
    with ``extract_terms``, ``map_representation`` turns them into a pairs
    dictionary, and ``write_dot.output_pairs_dict`` renders that dictionary
    as a graphviz-processable string.

    Diagnostics are emitted through ``logging`` at DEBUG level instead of
    being printed, so stdout stays clean for callers that print the result.

    Args:
        query: Forwarded to ``filter_query``.
        only_terms: If true, return only the terms of the graph as text.
        file_format: ``'raw'`` returns the dot string itself; any other
            value is handed to ``call_graphviz``.
        include_svg_dimensions: When false and ``file_format`` is ``'svg'``,
            the graphviz output is passed through ``strip_dimensions``.
        starting_year: Forwarded to ``filter_query``.
        ending_year: Forwarded to ``filter_query``.
        sample_size: Forwarded to ``filter_query``.
        evaluation_output_path: If truthy, the ``evaluation`` module is
            imported and its phrase-frequency and edge-weight plots are
            invoked with this path.
        term_type: Forwarded to ``extract_terms``.
        data_dump_path: Forwarded to ``map_representation``.
        n_layers: Forwarded to ``write_dot.output_pairs_dict``.
        graph_attrs: Forwarded to ``write_dot.output_pairs_dict``.
        **kwargs: Forwarded unchanged to ``map_representation``.

    Returns:
        * ``only_terms`` true: one term per line (the words of each term
          joined by single spaces), sorted.
        * ``file_format == 'raw'``: the dot string.
        * otherwise: the result of ``call_graphviz``, passed through
          ``strip_dimensions`` for SVG output without dimensions.
    """
    # Function-scope import: brings ``logging`` into scope without relying
    # on the (unseen) module-level import block.
    import logging
    logger = logging.getLogger(__name__)

    documents = filter_query(query,
                             starting_year=starting_year,
                             ending_year=ending_year,
                             sample_size=sample_size)
    extracted_terms = extract_terms(documents, term_type)
    (map_dict, graph_terms, phrase_frequencies,
     similarities, scored_phrases) = map_representation(
        extracted_terms, data_dump_path=data_dump_path, **kwargs)
    logger.debug('similarities type: %s', type(similarities))

    # map_string will be a graphviz-processable string
    map_string = write_dot.output_pairs_dict(
        map_dict,
        False,
        true_scaling=True,
        phrase_frequencies=phrase_frequencies,
        similarities=similarities,
        phrase_scores=scored_phrases,
        n_layers=n_layers,
        graph_attrs=graph_attrs)
    logger.debug('dot source:\n%s', map_string)

    if evaluation_output_path:
        # Imported only when evaluation output is actually requested.
        import evaluation
        evaluation.plot_phrase_frequencies(phrase_frequencies,
                                           evaluation_output_path)
        evaluation.plot_edge_weight_distribution(map_dict,
                                                 evaluation_output_path)

    if only_terms:
        return '\n'.join(sorted(' '.join(tpl) for tpl in graph_terms))
    if file_format == 'raw':
        return map_string

    map_ = call_graphviz(map_string, file_format)
    if file_format == 'svg' and not include_svg_dimensions:
        # NOTE(review): make_basemap unpacks strip_dimensions() into
        # (svg, width, height); if that is the same helper, this branch
        # returns a tuple rather than a plain SVG string - confirm callers.
        return strip_dimensions(map_)
    return map_
Example #2
0
def make_basemap(basemap):
    """Generate the dot and SVG representations of ``basemap`` and save them.

    Progress is reported through ``set_status``. The dot string and the
    JSON-encoded phrases are stored on the model and saved first; the SVG
    produced by ``call_graphviz`` (with its width and height, as returned by
    ``strip_dimensions``) is stored afterwards, ``finished`` is set and the
    model is saved a second time.

    Args:
        basemap: Model object supplying ``term_type``, ``ranking_algorithm``,
            ``similarity_algorithm``, ``filtering_algorithm`` and
            ``number_of_terms``, and receiving the generated attributes.

    Returns:
        ``(map_dict, graph_terms)`` from ``map_representation`` on success.
        If a ``ZeroDivisionError`` is raised inside the pipeline, an error
        status is set and the function falls through, returning ``None``.

    Raises:
        Exception: If ``extract_terms`` returns a falsy result.
    """
    try:
        set_status('getting document list', model=basemap)
        with ManagedSession() as session:
            doc_query = create_query_for_model(session, basemap, dirty=False)
            terms = extract_terms(doc_query, basemap.term_type)
        if not terms:
            raise Exception('No documents found matching query!')

        representation_options = dict(
            ranking_algorithm=basemap.ranking_algorithm,
            similarity_algorithm=basemap.similarity_algorithm,
            filtering_algorithm=basemap.filtering_algorithm,
            number_of_terms=basemap.number_of_terms,
            model=basemap)
        (pairs, graph_terms, frequencies,
         raw_similarities, scores) = map_representation(
            terms, **representation_options)

        # Graphviz-processable description of the map, reduced to ASCII.
        dot_output = write_dot.output_pairs_dict(
            pairs,
            True,
            phrase_frequencies=frequencies,
            true_scaling=True,
            similarities=raw_similarities,
            phrase_scores=scores)
        dot_text = dot_output.decode('ascii', 'ignore')

        # Store the dot source and the phrases (JSON) on the model and save
        # before graphviz is invoked.
        basemap.dot_rep = dot_text
        phrases_json = json.dumps(jsonize_phrase_set(graph_terms, None))
        basemap.phrases_in_map = phrases_json.decode('ascii', 'ignore')
        basemap.save()

        rendered = call_graphviz(dot_text, file_format='svg', model=basemap)
        basemap.svg_rep, basemap.width, basemap.height = strip_dimensions(
            rendered)
        basemap.finished = True
        basemap.save()

        set_status('basemap complete', model=basemap)
        print('basemap complete')
        return pairs, graph_terms
    except ZeroDivisionError:
        set_status('Error: too few documents to produce a map. Try a broader search', model=basemap)
Example #3
0
def make_map(query,
             only_terms=False,
             file_format='svg',
             include_svg_dimensions=False,
             starting_year=None,
             ending_year=None,
             sample_size=None,
             evaluation_output_path=None,
             term_type=TermExtraction.Phrases,
             data_dump_path=None,
             n_layers=0,
             graph_attrs=None,
             **kwargs):
    """Build a map for the documents matching ``query``.

    Pipeline: ``filter_query`` selects the documents, ``extract_terms``
    extracts their terms, ``map_representation`` builds the pairs
    dictionary and ``write_dot.output_pairs_dict`` serialises it into a
    graphviz-processable string.

    Diagnostic output goes to ``logging`` (DEBUG level) rather than stdout,
    so a caller that prints the returned map gets only the map.

    Args:
        query: Forwarded to ``filter_query``.
        only_terms: If true, return only the terms of the graph as text.
        file_format: ``'raw'`` returns the dot string itself; any other
            value is handed to ``call_graphviz``.
        include_svg_dimensions: When false and ``file_format`` is ``'svg'``,
            the graphviz output is passed through ``strip_dimensions``.
        starting_year: Forwarded to ``filter_query``.
        ending_year: Forwarded to ``filter_query``.
        sample_size: Forwarded to ``filter_query``.
        evaluation_output_path: If truthy, the ``evaluation`` module is
            imported and its phrase-frequency and edge-weight plots are
            invoked with this path.
        term_type: Forwarded to ``extract_terms``.
        data_dump_path: Forwarded to ``map_representation``.
        n_layers: Forwarded to ``write_dot.output_pairs_dict``.
        graph_attrs: Forwarded to ``write_dot.output_pairs_dict``.
        **kwargs: Forwarded unchanged to ``map_representation``.

    Returns:
        * ``only_terms`` true: one term per line (the words of each term
          joined by single spaces), sorted.
        * ``file_format == 'raw'``: the dot string.
        * otherwise: the result of ``call_graphviz``, passed through
          ``strip_dimensions`` for SVG output without dimensions.
    """
    # Function-scope import: brings ``logging`` into scope without relying
    # on the (unseen) module-level import block.
    import logging
    logger = logging.getLogger(__name__)

    documents = filter_query(query,
                             starting_year=starting_year,
                             ending_year=ending_year,
                             sample_size=sample_size)
    extracted_terms = extract_terms(documents, term_type)
    (map_dict, graph_terms, phrase_frequencies,
     similarities, scored_phrases) = map_representation(
        extracted_terms, data_dump_path=data_dump_path, **kwargs)
    logger.debug('similarities type: %s', type(similarities))

    # map_string will be a graphviz-processable string
    map_string = write_dot.output_pairs_dict(
        map_dict,
        False,
        true_scaling=True,
        phrase_frequencies=phrase_frequencies,
        similarities=similarities,
        phrase_scores=scored_phrases,
        n_layers=n_layers,
        graph_attrs=graph_attrs)
    logger.debug('dot source:\n%s', map_string)

    if evaluation_output_path:
        # Imported only when evaluation output is actually requested.
        import evaluation
        evaluation.plot_phrase_frequencies(phrase_frequencies,
                                           evaluation_output_path)
        evaluation.plot_edge_weight_distribution(map_dict,
                                                 evaluation_output_path)

    if only_terms:
        return '\n'.join(sorted(' '.join(tpl) for tpl in graph_terms))
    if file_format == 'raw':
        return map_string

    map_ = call_graphviz(map_string, file_format)
    if file_format == 'svg' and not include_svg_dimensions:
        # NOTE(review): make_basemap unpacks strip_dimensions() into
        # (svg, width, height); if that is the same helper, this branch
        # returns a tuple rather than a plain SVG string - confirm callers.
        return strip_dimensions(map_)
    return map_