from spacy.en import English
from spacy.symbols import *
from spacy.tokens.doc import Doc

# Set up a QuestionTypology for the Parliament dataset, using the corpus and
# motifs fetched through download().
num_clusters = 8
data_dir = download('parliament-corpus')
motifs_dir = download('parliament-motifs')
corpus = Corpus(filename=os.path.join(data_dir, 'parliament-corpus'))
questionTypology = QuestionTypology(
    corpus,
    data_dir,
    dataset_name='parliament',
    motifs_dir=motifs_dir,
    num_dims=25,
    num_clusters=num_clusters,
    verbose=False,
    random_seed=164,
)

# Question whose type we want to determine.
example_question = "Does my right hon Friend agree that excellent regional universities—for example , the University of Northumbria at Newcastle and Sunderland—are anxious that they will be at a disadvantage if an élite group of universities , mainly in the south - east of England , are allowed to raise their fees to figures upwards of £ 10,000 a year , as today 's newspapers reported the Minister for Lifelong Learning and Higher Education as saying ?"
# Alternative, shorter input to try instead:
# example_question = "What is the minister going to do about?"

# classify_question returns three values; the last one is reported below as
# the cluster of the question.
question_matrix, mtx, label = questionTypology.classify_question(example_question)

print('Question: ', example_question)
print('Cluster: ', label)
num_clusters = 8

# Get the precomputed motifs.
#   data_dir   -- the directory that contains the downloaded data
#   motifs_dir -- the specific path that contains the precomputed motifs
data_dir = os.path.join(
    pkg_resources.resource_filename("convokit", ""),
    'downloads',
)
motifs_dir = download('parliament-motifs')

# Load the corpus.
corpus = Corpus(filename=download("parliament-corpus"))

# Extract clusters of the motifs and assign questions to these clusters.
questionTypology = QuestionTypology(
    corpus,
    data_dir,
    dataset_name='parliament',
    motifs_dir=motifs_dir,
    num_dims=25,
    num_clusters=num_clusters,
    verbose=False,
    random_seed=164,
)

# questionTypology.types_to_data holds the data computed in the step above.
# Its keys are the indices of the clusters (here 0-7); each value is a
# dictionary with the following keys:
#   "motifs":         the motifs, as a list of tuples of the motif terms
#   "motif_dists":    the distance of each motif from the centroid of the
#                     cluster this motif is in
#   "fragments":      the answer fragments, as a list of tuples of answer terms
#   "fragment_dists": the distance of each fragment from the centroid of the
#                     cluster this fragment is in
#   "questions":      the IDs of the questions in this cluster; the
#                     corresponding question text is available through the
#                     get_question_text_from_pair_idx(pair_idx) method
#   "question_dists": the distance of each question from the centroid of the
#                     cluster this question is in
from spacy.tokens.doc import Doc

# Set up a QuestionTypology for the Parliament dataset. The motifs are read
# from the 'parliament-motifs' folder under convokit's 'downloads' directory.
num_clusters = 8
data_dir = os.path.join(
    pkg_resources.resource_filename("convokit", ""),
    'downloads',
)
motifs_dir = os.path.join(data_dir, 'parliament-motifs')
corpus = Corpus(filename=download("parliament-corpus"))
questionTypology = QuestionTypology(
    corpus,
    data_dir,
    dataset_name='parliament',
    motifs_dir=motifs_dir,
    num_dims=25,
    num_clusters=num_clusters,
    verbose=False,
    random_seed=164,
)

# Preprocessing: create the spaCy pipeline object and an English vocabulary.
spacy_NLP = spacy.load('en')
vocab = English().vocab

# JSON files located inside the typology's motifs directory.
question_fit_file = os.path.join(
    questionTypology.motifs_dir,
    'question_fits.json',
)
superset_file = os.path.join(
    questionTypology.motifs_dir,
    'question_supersets_arcset_to_super.json',
)