import sys
from Corpus import Corpus

# Run the Corpus pipeline on a path typed in by the user, calling in order:
# read -> preprocess -> ner -> train_word2vec.
corpus = Corpus()
dirr = input("Input directory of corpus:")  # corpus path is read interactively from stdin
# dirr = "resources/corpus_mini.txt"
corpus.read(dirr)
corpus.preprocess()
corpus.ner()  # presumably named-entity recognition over the corpus -- confirm against Corpus
corpus.train_word2vec()  # presumably trains the word2vec model -- confirm against Corpus







import dash_bootstrap_components as dbc
import dash_html_components as html
import plotly.express as px
from dash.dependencies import Input, Output
import plotly.graph_objects as go
import plotly.figure_factory as ff
from statsmodels.graphics.gofplots import qqplot

import pandas as pd
import numpy as np

# Bind the name `dash` explicitly: `from dash.dependencies import ...` does not
# make `dash` itself available, so `dash.Dash(...)` would raise NameError.
import dash

# Dash application styled with the Bootstrap theme.
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

corpus = Corpus()  # initialise
dirr = "resources/vn_express.txt"  # path of the corpus to load
corpus.read(dirr)  # read the corpus from that path
corpus.preprocess()  # preprocessing
corpus.read_word2vec()  # load the word2vec model used to look up synonyms
corpus.read_ner()  # load the pre-processed named-entity and part-of-speech data


def genResult(res):
    """Build one styled ``html.P`` paragraph per entry of ``res``.

    Args:
        res: Iterable of items; each item becomes the ``children`` of one
            paragraph.

    Returns:
        list: The paragraphs in the same order as ``res``; an empty list
        when ``res`` is empty.
    """
    result = [
        html.P(children=sen,
               # A fresh style dict per paragraph, so components never share
               # one mutable mapping.
               style={
                   'backgroundColor': 'white',
                   'borderBottom': '2px solid #4F2992',
                   'margin': '30px',
                   'padding': '10px'
               }) for sen in res
    ]
    # The list was previously built and then dropped, so callers got None.
    return result
import dash_core_components as dcc
import dash_bootstrap_components as dbc
import dash_html_components as html
import plotly.express as px
from dash.dependencies import Input, Output
import plotly.graph_objects as go
import plotly.figure_factory as ff
from statsmodels.graphics.gofplots import qqplot

import pandas as pd
import numpy as np

# Bind the name `dash` explicitly: `from dash.dependencies import ...` does not
# make `dash` itself available, so `dash.Dash(...)` would raise NameError.
import dash

# Dash application styled with the Bootstrap theme.
app = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP])

# Load the corpus from the fixed path, then call preprocess and read_ner.
corpus = Corpus()
corpus.read('resources/vn_express.txt')
corpus.preprocess()
corpus.read_ner()
# corpus.read_word2vec()

# Single-column frame ('sentences') built from corpus.data_sent_segment.
df = pd.DataFrame(corpus.data_sent_segment, columns=['sentences'])

# # đếm theo tiếng
# df['len_tieng'] = df['sentences'].str.split()
# df['len_tieng'] = df['len_tieng'].apply(len)

# so_tieng = df['len_tieng'].sum()
# cau_tu_tieng = pd.DataFrame(np.array([len(corpus.data_sent_segment), len(corpus.vocab.keys()), so_tieng]))
# print(cau_tu_tieng)

# Độ dài câu theo tiếng