Esempio n. 1
0
import os, sys, time
import glob
import types
import ipywidgets as widgets
import text_corpus
import domain_logic_vatican as domain_logic
import nltk
import pandas as pd
import zipfile

# Prepend '.' and '..' to the module search path when they are missing.
# Building the list from a tuple (instead of a set difference) keeps the
# order deterministic: '.' is always searched before '..', independent of
# string-hash randomization.
sys.path = [p for p in ('.', '..') if p not in sys.path] + sys.path

import common.widgets_config as widgets_config
import common.utility as utility

logger = utility.getLogger('corpus_text_analysis')

from nltk.parse import corenlp

STANFORD_CORE_NLP_URL = 'http://localhost:9000'


def merge_entities(entities):
    n_entities = len(entities)
    if n_entities <= 1:
        return entities
    merged = entities[:1]
    for doc_id, i_n, w_n, t_n in entities[1:]:
        doc_id_p, i_p, w_p, t_p = merged[-1]
        if i_n == i_p + 1 and t_n == t_p:
            merged[-1] = (doc_id, i_n, '_'.join([w_p, w_n]), t_p)
import os
import shutil
import pandas as pd
import glob
import time
import zipfile

from common.utility import getLogger

logger = getLogger()


class FileUtility:
    def __init__(self, directory) -> None:
        """Remember the directory this helper operates on.

        :param directory: Path of the target directory. It is only stored
            here; nothing is checked or created until ``create`` is called.
        """
        self.directory = directory

    def create(self, clear_target_dir=False):

        if os.path.exists(self.directory) and clear_target_dir:
            shutil.rmtree(self.directory)

        if not os.path.exists(self.directory):
            os.makedirs(self.directory)

        return self

    @staticmethod
    def read_excel(filename, sheet):
        if not os.path.isfile(filename):
            raise Exception("File {0} does not exist!".format(filename))
        with pd.ExcelFile(filename) as xls:
Esempio n. 3
0
from IPython.display import display

import pandas as pd
import nltk

# Make the parent directory importable unless it is already on the path.
if '..' not in os.sys.path:
    os.sys.path = os.sys.path + ['..']

import common.widgets_config as widgets_config
import common.config as config
import common.color_utility as color_utility
import common.utility as utility
import headnote_corpus

from pprint import pprint as pp

logger = utility.getLogger(name='title_analysis')

# Output choices: display label -> identifier of the output format.
# Three tabular variants come first, followed by five plot variants.
OUTPUT_OPTIONS = {
    "Table": "table",
    "Table, grid": "qgrid",
    "Table, unstacked": "unstack",
    "Plot bar": "plot_bar",
    "Plot stacked bar": "plot_stacked_bar",
    "Plot line": "plot_line",
    "Plot area": "plot_area",
    "Plot stacked area": "plot_stacked_area",
}

EXTRA_GROUPBY_OPTIONS = {
    '': None,
    'Topic': [ 'topic_category' ],
import ipywidgets as widgets
import pandas as pd
import common.config as config
import common.utility as utility
import common.treaty_utility as treaty_utility
import common.widgets_config as widgets_config
import common.color_utility as color_utility
import analysis_data
import analysis_plot

from IPython.display import display
from pprint import pprint as pp

logger = utility.getLogger('tq_by_topic')


def display_topic_quantity(period_group=0,
                           topic_group=None,
                           party_group=None,
                           recode_is_cultural=False,
                           normalize_values=False,
                           extra_other_category=False,
                           chart_type_name=None,
                           plot_style='classic',
                           target_quantity="topic",
                           wti_index=None,
                           progress=utility.noop):
    try:
        # print(locals())
        progress()
Esempio n. 5
0
from IPython.display import display

# Make the parent directory importable unless it is already on the path.
if '..' not in os.sys.path:
    os.sys.path = os.sys.path + ['..']

import common.config as config
import common.widgets_config as widgets_config
import common.color_utility as color_utility
import common.utility as utility

from common.network.layout import layout_setups, layout_network
from common.network.networkx_utility import create_nx_subgraph, get_positioned_edges2, get_positioned_nodes
from network_analysis_plot import plot_network, get_palette
from network_analysis import create_party_network, slice_network_datasource, setup_node_size, adjust_node_label_offset

NETWORK_LAYOUT_OPTIONS = {x.name: x.key for x in layout_setups}
logger = utility.getLogger('network_analysis')
warnings.filterwarnings('ignore')

# Plot settings: axis types, background colour, and nested option sets for
# lines ('line_opts') and nodes ('node_opts').
# NOTE(review): the consumer of these options is not visible in this chunk.
NETWORK_PLOT_OPTS = {
    'x_axis_type': None,
    'y_axis_type': None,
    'background_fill_color': 'white',
    'line_opts': {'color': 'green', 'alpha': 0.5},
    'node_opts': {'color': None, 'level': 'overlay', 'alpha': 1.0},
}

NODE_SIZE_OPTIONS = {
    '(default)': None,
    'Degree centrality': 'degree',
    'Closeness centrality': 'closeness',
    'Betweenness centrality': 'betweenness',
import collections
import pandas as pd

from corpora.corpus_source_reader import SparvCorpusSourceReader
from corpora.zip_utility import ZipReader
from common.utility import getLogger, extend

logger = getLogger(__name__)

# Record describing one part-of-speech tag: the tag itself, its description,
# and a flag telling whether the tag is a deliminator.
KindOfPoS = collections.namedtuple(
    'KindOfPoS',
    ['tag', 'description', 'is_deliminator'],
)

SUC_POS_TAGS = {
    'AB':
    KindOfPoS(tag='AB',
              description={
                  'en': 'Adverb',
                  'se': 'Adverb'
              },
              is_deliminator=False),
    'DT':
    KindOfPoS(tag='DT',
              description={
                  'en': 'Determiner',
                  'se': 'Determinerare, bestämningsord'
              },
              is_deliminator=False),
    'HA':
    KindOfPoS(tag='HA',
              description={
                  'en': 'Interrogative/Relative Adverb',
Esempio n. 7
0
import ipywidgets as widgets
import itertools
import types
import pandas as pd
import common.widgets_config as widgets_config
import common.config as config
import common.utility as utility
import common.treaty_utility as treaty_utility
import common.color_utility as color_utility
import analysis_data
import analysis_plot
from pprint import pprint as pp
from IPython.display import display

logger = utility.getLogger('tq_by_party')

# Choices for the extra category: display label -> option value.
# 'Nothing' maps to the empty string.
OTHER_CATEGORY_OPTIONS = {
    "Other category": "other_category",
    "All category": "all_category",
    "Nothing": "",
}


def display_quantity_by_party(period_group_index=0,
                              party_name='',
                              parties=None,
                              year_limit=None,
                              treaty_filter='',
                              extra_category='',
                              normalize_values=False,
                              chart_type_name=None,
from geopy.geocoders import GoogleV3# GeoNames, Nominatim, GoogleV3     # if explicit use of geopy
from . geocode_loc_tags import assign_geocodes, load_swener_tags, get_country
import numpy as np
import pandas as pd
import common.file_utility as file_utility
import common.utility as utility

logger = utility.getLogger(__name__)

def setup_unique_locations_dataframe(df_tags, geocoded_filename):
    """Build a table of unique location entities merged with stored geocodes.

    Rows of ``df_tags`` whose ``category`` contains ``'LOC'`` are reduced to
    their distinct ``entity`` values. Each entity starts out with empty (NaN)
    ``processed``, ``latitude``, ``longitude``, ``reversename`` and
    ``country`` fields, which are then filled from sheet ``'Sheet1'`` of
    ``geocoded_filename`` wherever that file holds a value for the entity.

    :param df_tags: DataFrame with at least ``category`` and ``entity``
        columns.
    :param geocoded_filename: Excel file with previously geocoded entities;
        its sheet must contain an ``entity`` column.
    :return: DataFrame indexed by ``entity`` covering both the entities found
        in ``df_tags`` and those already present in the geocoded file.
    """
    # na=False treats a missing category as "not a location"; without it the
    # mask would contain NaN and boolean indexing would raise.
    is_location = df_tags['category'].str.contains('LOC', na=False)
    df = df_tags.loc[is_location, 'entity'].drop_duplicates().to_frame()

    # Placeholder columns; combine_first() below only fills holes, so these
    # must be NaN for the stored geocodes to show through.
    for column in ('processed', 'latitude', 'longitude', 'reversename', 'country'):
        df[column] = np.nan
    df = df.set_index('entity')

    df_geocoded = file_utility.FileUtility.read_excel(
        filename=geocoded_filename, sheet='Sheet1'
    ).set_index('entity')
    return df.combine_first(df_geocoded)

def assign_country_to_locations(df):
    """Add country name and code columns derived from ``reversename``.

    Each ``reversename`` value is converted to ``str`` and passed to
    ``get_country``. From the result, the ``name``, ``alpha_2`` and
    ``alpha_3`` attributes are written to the ``country``, ``country_code``
    and ``country_code3`` columns; a ``None`` lookup result yields ``None``
    in all three. ``df`` is modified in place and nothing is returned.
    """
    countries = df['reversename'].map(lambda name: get_country(str(name)))

    column_to_attribute = (
        ('country', 'name'),
        ('country_code', 'alpha_2'),
        ('country_code3', 'alpha_3'),
    )
    for column, attribute in column_to_attribute:
        # Bind the attribute name as a default so each lambda keeps its own.
        df[column] = countries.map(
            lambda found, attr=attribute: None if found is None else getattr(found, attr)
        )

def process_geocoding(df_tags, geolocator, geocoded_filename, geocoded_output_filename):
# -*- coding: utf-8 -*-

import os
import pandas as pd
from common.file_utility import FileUtility
import common.utility as utility

join = os.path.join
logger = utility.getLogger('NotebookDataGenerator')

class NotebookDataGenerator():
    """Class that prepares and extracts various data from LDA model.
    Main purpose is to prepare data for Jupyter notebooks
    """

    def __init__(self, store) -> None:
        """Keep a reference to ``store`` on the instance.

        :param store: Object saved as ``self.store``; it is not used here.
            NOTE(review): presumably where generated data is persisted —
            confirm against the methods that use it.
        """
        self.store = store


    def _compile_dictionary(self, lda):
        """Return the model's vocabulary as a DataFrame.

        :param lda: Model object whose ``id2word`` mapping yields
            ``(token_id, token)`` pairs from ``items()`` and exposes a
            ``dfs`` attribute (dict of token id to document frequency, or
            ``None``).
        :return: DataFrame indexed by ``token_id`` with the columns ``token``
            and ``dfs``; ``dfs`` is 0 wherever no document frequency is
            available. An empty vocabulary gives an empty DataFrame.
        """
        logger.info('Compiling dictionary...')
        id2word = lda.id2word

        # Split the pairs without tuple-unpacking zip(*...), which fails on
        # an empty vocabulary.
        items = list(id2word.items())
        token_ids = [token_id for token_id, _ in items]
        tokens = [token for _, token in items]

        doc_freqs = id2word.dfs
        if doc_freqs is None:
            dfs = [0] * len(tokens)
        else:
            # Look frequencies up by token id: dfs.values() is iterated
            # independently of items(), so pairing the two positionally can
            # attach a frequency to the wrong token or fail on a length
            # mismatch.
            dfs = [doc_freqs.get(token_id, 0) for token_id in token_ids]

        dictionary = pd.DataFrame({
            'token_id': token_ids,
            'token': tokens,
            'dfs': dfs
        }).set_index('token_id')[['token', 'dfs']]
        return dictionary

    def __compile_document_topics_iter(self, lda, mm, minimum_probability):
import ipywidgets as widgets
import pandas as pd
import common.config as config
import common.utility as utility
import common.treaty_utility as treaty_utility
import common.widgets_config as widgets_config
import common.color_utility as color_utility
import analysis_data
import analysis_plot
import logging
import types

from IPython.display import display
from pprint import pprint as pp

logger = utility.getLogger('tq_by_topic', level=logging.WARNING)


def display_topic_quantity(period_group=0,
                           topic_group=None,
                           party_group=None,
                           recode_is_cultural=False,
                           normalize_values=False,
                           extra_other_category=False,
                           chart_type_name=None,
                           plot_style='classic',
                           target_quantity="topic",
                           treaty_sources=None,
                           wti_index=None,
                           progress=utility.noop):
    try: