from spacy.en import English
from spacy.symbols import *
from spacy.tokens.doc import Doc

# Initialize the QuestionTypology class pretrained on the Parliament dataset.

num_clusters = 8

# Fetch the corpus first, then the motifs; both paths come from download().
# NOTE(review): data_dir is whatever download('parliament-corpus') returns, and the
# corpus filename below appends 'parliament-corpus' to it a second time. Other setups
# in this file pass download("parliament-corpus") straight to Corpus -- confirm which
# form matches the download() helper in use.
data_dir = download('parliament-corpus')
motifs_dir = download('parliament-motifs')

corpus_file = os.path.join(data_dir, 'parliament-corpus')
corpus = Corpus(filename=corpus_file)

# Keyword settings are collected once and expanded into the constructor call.
typology_options = dict(
    dataset_name='parliament',
    motifs_dir=motifs_dir,
    num_dims=25,
    num_clusters=num_clusters,
    verbose=False,
    random_seed=164,
)
questionTypology = QuestionTypology(corpus, data_dir, **typology_options)

# Determine the type (cluster label) of an input question.

example_question = "Does my right hon Friend agree that excellent regional universities—for example , the University of Northumbria at Newcastle and Sunderland—are anxious that they will be at a disadvantage if an élite group of universities , mainly in the south - east of England , are allowed to raise their fees to figures upwards of £ 10,000 a year , as today 's newspapers reported the Minister for Lifelong Learning and Higher Education as saying ?"
# Alternative input, left disabled:
# example_question = "What is the minister going to do about?"

# classify_question hands back three values; only the label is reported below.
classification = questionTypology.classify_question(example_question)
question_matrix, mtx, label = classification

for caption, shown in (('Question: ', example_question), ('Cluster: ', label)):
    print(caption, shown)
num_clusters = 8

# Locate the precomputed motifs. data_dir is the 'downloads' folder under the convokit
# package resources; motifs_dir is the path returned by download('parliament-motifs').
# NOTE(review): the original comment states motifs_dir lies within data_dir -- that
# depends on where download() stores its files; confirm.
package_root = pkg_resources.resource_filename("convokit", "")
data_dir = os.path.join(package_root, 'downloads')
motifs_dir = download('parliament-motifs')

# Load the corpus from the path returned by download().
corpus_path = download("parliament-corpus")
corpus = Corpus(filename=corpus_path)

# Extract clusters of the motifs and assign questions to these clusters.
typology_kwargs = {
    'dataset_name': 'parliament',
    'motifs_dir': motifs_dir,
    'num_dims': 25,
    'num_clusters': num_clusters,
    'verbose': False,
    'random_seed': 164,
}
questionTypology = QuestionTypology(corpus, data_dir, **typology_kwargs)

# questionTypology.types_to_data holds the data computed in the step above.
# Its keys are the indices of the clusters (here 0-7). Each value is a dictionary with the following keys:
# "motifs": the motifs, as a list of tuples of the motif terms
# "motif_dists": the distance of each motif from the centroid of the cluster that motif is in
# "fragments": the answer fragments, as a list of tuples of answer terms
# "fragment_dists": the distance of each fragment from the centroid of the cluster that
# fragment is in
# "questions": the IDs of the questions in this cluster. You can get the corresponding question text by using the
# get_question_text_from_pair_idx(pair_idx) method.
# "question_dists": the distance of each question from the centroid of the cluster
# that question is in
from spacy.tokens.doc import Doc

# Initialize the QuestionTypology class pretrained on the Parliament dataset.

num_clusters = 8

# data_dir is the 'downloads' folder under the convokit package resources.
downloads_parent = pkg_resources.resource_filename("convokit", "")
data_dir = os.path.join(downloads_parent, 'downloads')
# The motifs are not fetched with download() here; their path is built under data_dir.
motifs_dir = os.path.join(data_dir, 'parliament-motifs')

corpus = Corpus(filename=download("parliament-corpus"))

# Positional and keyword arguments are gathered separately, then expanded.
typology_args = (corpus, data_dir)
typology_settings = dict(
    dataset_name='parliament',
    motifs_dir=motifs_dir,
    num_dims=25,
    num_clusters=num_clusters,
    verbose=False,
    random_seed=164,
)
questionTypology = QuestionTypology(*typology_args, **typology_settings)

# Preprocessing: create the spaCy objects and locate the motif JSON files.

# NOTE(review): `spacy` is used as a bare module name here, but only `from spacy...`
# imports are visible in this file -- confirm `import spacy` (and `os`) exist.
spacy_NLP = spacy.load('en')
# The vocabulary is taken from a separately constructed English() instance.
vocab = English().vocab

# Both files sit directly inside the typology's motifs directory.
question_fit_file, superset_file = (
    os.path.join(questionTypology.motifs_dir, json_name)
    for json_name in ('question_fits.json',
                      'question_supersets_arcset_to_super.json')
)