import sys

from Corpus import Corpus

# Corpus build script: ask the user where the corpus lives, then call the
# Corpus stages in the order read -> preprocess -> ner -> train_word2vec.
corpus = Corpus()

# The corpus location is typed in at the prompt; the commented line below is
# a fixed alternative kept from the original script.
dirr = input("Input directory of corpus:")
# dirr = "resources/corpus_mini.txt"

corpus.read(dirr)
corpus.preprocess()
corpus.ner()
corpus.train_word2vec()
import dash_bootstrap_components as dbc
import dash_html_components as html
import plotly.express as px
from dash.dependencies import Input, Output
import plotly.graph_objects as go
import plotly.figure_factory as ff
from statsmodels.graphics.gofplots import qqplot
import pandas as pd
import numpy as np

# NOTE(review): `dash` and `Corpus` are used below, but their imports are not
# visible in this excerpt -- confirm they are imported earlier in the file.
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

corpus = Corpus()  # initialise
dirr = "resources/vn_express.txt"  # choose the corpus path
corpus.read(dirr)  # read from that path
corpus.preprocess()  # preprocessing
corpus.read_word2vec()  # load the word2vec model used to look up synonyms
corpus.read_ner()  # load the already-processed named-entity and part-of-speech data


def genResult(res):
    """Wrap every item of ``res`` in its own styled ``html.P`` paragraph.

    NOTE(review): no ``return`` statement is visible in this excerpt, so as
    shown the list is built and the function returns ``None`` -- confirm
    whether the definition continues (e.g. with ``return result``).
    """
    result = []
    for sentence in res:
        # A fresh style dict is created for each paragraph.
        paragraph = html.P(
            children=sentence,
            style={
                'backgroundColor': 'white',
                'borderBottom': '2px solid #4F2992',
                'margin': '30px',
                'padding': '10px',
            },
        )
        result.append(paragraph)
import dash_core_components as dcc
import dash_bootstrap_components as dbc
import dash_html_components as html
import plotly.express as px
from dash.dependencies import Input, Output
import plotly.graph_objects as go
import plotly.figure_factory as ff
from statsmodels.graphics.gofplots import qqplot
import pandas as pd
import numpy as np

# NOTE(review): `dash` and `Corpus` are used below, but their imports are not
# visible in this excerpt -- confirm they are imported earlier in the file.
app = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP])

# Set up the corpus from the fixed resource file and run the same stages as
# the original script; the word2vec load stays disabled.
corpus = Corpus()
corpus.read('resources/vn_express.txt')
corpus.preprocess()
corpus.read_ner()
# corpus.read_word2vec()

# One-column frame built from corpus.data_sent_segment, column 'sentences'.
df = pd.DataFrame(corpus.data_sent_segment, columns=['sentences'])

# # count by syllable
# df['len_tieng'] = df['sentences'].str.split()
# df['len_tieng'] = df['len_tieng'].apply(len)
# so_tieng = df['len_tieng'].sum()
# cau_tu_tieng = pd.DataFrame(np.array([len(corpus.data_sent_segment), len(corpus.vocab.keys()), so_tieng]))
# print(cau_tu_tieng)

# Sentence length in syllables