def Summarize(self, x=1):
    for meeting in self.transcripts:
        print('\n\n\n\n\nMeeting ' + str(x) + ' ...')

        # preprocessing
        prep = Preprocessing()
        prep.Preprocess(meeting)
        print("Preprocessing completed ...")

        # frequency vectors
        freq = FrequencyMeasures(prep.meetingHisto, prep.singleWords,
                                 self.histograms['ListWordsVector'],
                                 prep.numSpeakers)
        freq.GetAll()
        print("Frequencies computed ...")

        # functional segmentation
        segm = FuncSegm(prep, freq.suidf, prep.numSpeakers)
        segm.Segmentation()
        print("Segmentation completed ...")

        # keywords
        keyw = Extractor(prep, segm, freq.idf)
        keyw.ExtractKeywords()
        print("Keywords extracted ...")

        # check whether each segment is a monologue or a dialogue
        # and apply the matching summarization method
        localSummary = []
        for i, dstr in enumerate(segm.speakerDistr):
            if len(segm.cleanSentences[i]) > 1:
                if np.sum(dstr) == 1:
                    mon = Monologue(segm, keyw, i)
                    mon.Summarize()
                    localSummary.append(mon.summary)
                    print("Monologue summarized ...")
                else:
                    dial = Dialogue(prep, segm, self.histograms,
                                    self.topicModels, freq.suidf,
                                    freq.tfidfSpeak, i)
                    dial.Summarize()
                    localSummary.append(dial.summary)
                    print("Dialogue summarized ...")
            elif len(segm.cleanSentences[i]) == 1:
                # single-sentence segments are copied verbatim
                localSummary.append(str(segm.cleanSentOrig[i]))
            # empty segments contribute nothing to the summary

        # join, save and append the final summary
        txtSummary = ' '.join(localSummary)
        Help.SaveFileTxt(txtSummary, 'summary_' + str(x), self.resultPath)
        x += 1
        self.summaries.append(txtSummary)
        print("Summary stored ...")
    print("Dataset summarized!!!")
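# A small, self-contained illustration of the monologue/dialogue test used
# above. It assumes (as the np.sum(dstr) == 1 check suggests) that
# segm.speakerDistr holds one binary vector per segment marking which of the
# meeting's speakers occur in it; the two vectors below are made up.
import numpy as np

monologue_segment = np.array([0, 1, 0, 0])  # only speaker 2 talks
dialogue_segment = np.array([1, 1, 0, 1])   # three speakers talk
print(np.sum(monologue_segment) == 1)  # True  -> Monologue path
print(np.sum(dialogue_segment) == 1)   # False -> Dialogue path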
def ReadReferences(self):
    reference = []
    for file in os.listdir(self.referencePath):
        # meeting name corresponds to the first 7 characters of the file name
        filename = os.path.join(self.referencePath, file)
        with open(filename) as partText:
            reference.append(partText.read())
    return reference
def get_pipeline_results():
    # preload preprocessor
    preprocessor = Preprocessing()

    # preload models
    models = {
        #'Doc2Vec': Embedding(method='Doc2Vec'),
        #'TfIdf': Embedding(method='TfIdf',
        #                   dict_path='../models/dict/dict.joblib',
        #                   model_path='../models/tfidf/tfidf.joblib'),
        'HDP': Embedding(method='HDP',
                         dict_path='../models/dict/dict.joblib',
                         model_path='../models/hdp/hdp.joblib'),
        #'BERT': Embedding(method='BERT')
    }
    print('Finished loading')

    with open('projects.json') as json_file:
        projects = json.load(json_file)
    descriptions = [project["description"] for project in projects]
    print(projects[0]["title"])

    # grid search over the t-SNE hyperparameters and random seed
    for seed in [3, 5, 7]:
        for perp in range(3, 19):
            for lr in range(10, 101, 10):
                pipe = Pipeline([
                    ('Preprocessing', preprocessor),
                    ('Embedding', models['HDP']),
                    ('EmbeddingData', Debug()),
                    ('PlaneReduction', PlaneReduction(2, method='TSNE',
                                                      perplexity=perp,
                                                      learning_rate=lr,
                                                      metric=jensenshannon,
                                                      random_state=seed))
                ])
                tfs_plane = pipe.fit_transform(descriptions)
                tfs_mapped = mapToSpaceSampling(tfs_plane)  # mapped layout (not included in the dump below)
                uncertainty = entropy(pipe.named_steps.EmbeddingData.data, axis=1)
                results = [{
                    'mappoint': mappoint,
                    'project_data': project,
                    'entropy': ent,
                    'lr': lr,
                    'perp': perp
                } for mappoint, project, ent in zip(tfs_plane.tolist(), projects,
                                                    uncertainty.tolist())]
                file_name = ('pipeline-results/lr' + str(lr) + '_p' + str(perp)
                             + '_seed' + str(seed) + '_HDP.txt')
                with open(file_name, 'w') as filehandle:
                    json.dump(results, filehandle)
            print('computed samples with perplexity ' + str(perp))
    print("computation complete")
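# A minimal sketch of how one of the dumped grid-search files could be read
# back for inspection; 'lr10_p3_seed3_HDP.txt' is one concrete
# (learning rate, perplexity, seed) combination produced by the loops above.
import json

with open('pipeline-results/lr10_p3_seed3_HDP.txt') as fh:
    results = json.load(fh)
for entry in results[:3]:
    print(entry['mappoint'], entry['entropy'], entry['lr'], entry['perp'])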
# document embedding
import bz2
import csv
import itertools

from gensim.models import TfidfModel, HdpModel
from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from joblib import dump

from Preprocessing.preprocessing import Preprocessing

print('Loading and preprocessing data')
with bz2.open('/gepris_data/train.csv.bz2', mode='rt') as f:
    csvreader = csv.reader(f)
    traindata = Preprocessing().fit_transform((row[1] for row in csvreader))
    f.seek(0)  # rewind so the reader can be consumed a second time
    doc2author = {i: row[3:] for i, row in enumerate(csvreader)}

print('Building dict and training TfIdf model:')
dct = Dictionary(traindata)  # fit dictionary
tfidf_model = TfidfModel(dct.doc2bow(doc) for doc in traindata)  # fit model
dump(tfidf_model, '/models/tfidf/tfidf.joblib')
dump(dct, '/models/dict/dict.joblib')

print('HDP training:')
hdp_model = HdpModel([dct.doc2bow(doc) for doc in traindata], id2word=dct)
dump(hdp_model, '/models/hdp/hdp.joblib')
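# A minimal sketch (not part of the training script) of how the dumped
# artifacts could be loaded again and applied to a new, already-tokenized
# document; the token list is made up for illustration.
from joblib import load

dct = load('/models/dict/dict.joblib')
tfidf_model = load('/models/tfidf/tfidf.joblib')
hdp_model = load('/models/hdp/hdp.joblib')

bow = dct.doc2bow(['topic', 'model', 'training'])  # hypothetical tokens
print(tfidf_model[bow])  # sparse tf-idf weights for the document
print(hdp_model[bow])    # topic distribution under the HDP model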
def load_data():
    print("..Loading the data")

    print("....Glove")
    vectors = Vectors(**{
        'file_name': config.embedding_file,
        'embedding_dim': 300
    }).glove()

    print("....Setting up preprocessor")
    with p_utils.add_indent(3):
        if config.preprocessor == "Ekphrasis":
            preprocessor = EkphrasisProxy().preprocess_text
        elif config.preprocessor == "NL-FIIT":
            preprocessor = Preprocessing().process_test
        else:
            raise NotImplementedError

    print("....Data")
    raw_data = pd.read_csv(config.train_file, sep="\t", header=0,
                           quoting=3).sample(frac=1)
    rawish_data = (raw_data['turn1'] + "\t" + raw_data['turn2']
                   + "\t" + raw_data['turn3'])
    labels_as_indices = [
        config.labels_to_index[label] for label in raw_data['label']
    ]

    if config.train_val_test_split is not None:
        train_end = int(raw_data.shape[0] * config.train_val_test_split[0])
        val_end = int(raw_data.shape[0] * (config.train_val_test_split[0]
                                           + config.train_val_test_split[1]))
        rawish_train = rawish_data.iloc[0:train_end]
        rawish_val = rawish_data.iloc[train_end:val_end]
        labels_train = labels_as_indices[0:train_end]
        labels_val = labels_as_indices[train_end:val_end]
    else:
        rawish_train = rawish_data
        labels_train = labels_as_indices
        raw_val = pd.read_csv(config.validation_file, sep="\t", header=0,
                              quoting=3)
        rawish_val = (raw_val['turn1'] + "\t" + raw_val['turn2']
                      + "\t" + raw_val['turn3'])
        labels_val = [
            config.labels_to_index[label] for label in raw_val['label']
        ]

    if config.test:
        if (config.train_val_test_split is not None
                and len(config.train_val_test_split) > 2):
            test_start = int(raw_data.shape[0] * (config.train_val_test_split[0]
                                                  + config.train_val_test_split[1]))
            rawish_test = rawish_data.iloc[test_start:]
            labels_test = labels_as_indices[test_start:]
        else:
            raw_test = pd.read_csv(config.test_file_w_labels, sep="\t",
                                   header=0, quoting=3)
            rawish_test = (raw_test['turn1'] + "\t" + raw_test['turn2']
                           + "\t" + raw_test['turn3'])
            labels_test = [
                config.labels_to_index[label] for label in raw_test['label']
            ]

    with p_utils.add_indent(3, mode='err'):
        train_data = MultiTurnClassificationDataset(
            rawish_train, labels_train, vocab=vectors[0],
            preprocessing=preprocessor, split_turns=config.split_turns)
        val_data = MultiTurnClassificationDataset(
            rawish_val, labels_val, vocab=vectors[0],
            preprocessing=preprocessor, split_turns=config.split_turns)
    train_generator = pytoune_generator(train_data, config.turns)
    val_generator = pytoune_generator(val_data, config.turns)
    result = (train_generator, val_generator)

    if config.test:
        with p_utils.add_indent(3, mode='err'):
            test_data = MultiTurnClassificationDataset(
                rawish_test, labels_test, vocab=vectors[0],
                preprocessing=preprocessor, split_turns=config.split_turns)
        test_generator = pytoune_generator(test_data, config.turns,
                                           allow_reset=False)
        result += (test_generator, labels_test)

    if config.submission_file is not None:
        raw_submission = pd.read_csv(config.test_file_wo_labels, sep="\t",
                                     header=0, quoting=3)
        rawish_submission = (raw_submission['turn1'] + "\t"
                             + raw_submission['turn2'] + "\t"
                             + raw_submission['turn3'])
        with p_utils.add_indent(3, mode='err'):
            submission_data = MultiTurnClassificationDataset(
                rawish_submission, y=None, vocab=vectors[0],
                preprocessing=preprocessor, split_turns=config.split_turns)
        submission_generator = pytoune_generator(submission_data, config.turns,
                                                 allow_reset=False,
                                                 return_labels=False)
        return result + (vectors, ) + (raw_submission, submission_generator)

    return result + (vectors, )
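# A small worked example of the fractional slicing in load_data, assuming a
# hypothetical train_val_test_split of (0.8, 0.1, 0.1) and 1000 rows: train
# covers rows [0, 800), validation [800, 900) and the test split [900, 1000).
split = (0.8, 0.1, 0.1)
n_rows = 1000
train_end = int(n_rows * split[0])             # 800
val_end = int(n_rows * (split[0] + split[1]))  # 900
print(train_end, val_end)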
from sklearn.metrics import silhouette_samples
from sklearn.pipeline import Pipeline
import numpy as np
from bokeh.palettes import d3

from Preprocessing.preprocessing import Preprocessing
from Embedding.embedding import Embedding
from Topicextraction.topicextraction import TopicExtraction
from Clustering.clustering import Clustering
from Planereduction.planereduction import PlaneReduction
from Linearization.linearization import mapToSpaceSampling, computeClusterTopography
from Debug.debug import Debug

# preload preprocessor
print('Starting to load the NLP engine')
preprocessor = Preprocessing()

# preload models
print('Starting to load the models')
models = {
    #'Doc2Vec': Embedding(method='Doc2Vec'),
    'TfIdf': Embedding(method='TfIdf',
                       dict_path='../../../assets/models/tfidf/dict.lzma',
                       model_path='../../../assets/models/tfidf/tfidf.lzma'),
    #'BERT': Embedding(method='BERT')
}
print('Finished loading')
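# A minimal usage sketch, mirroring the pipelines elsewhere in this repo, of
# how the preloaded preprocessor and TfIdf embedding could be chained; the two
# texts below are made up for illustration.
texts = ['first project description', 'second project description']
pipe = Pipeline([('Preprocessing', preprocessor),
                 ('Embedding', models['TfIdf'])])
embedded = pipe.fit_transform(texts)
print(embedded)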
from enum import Enum
from typing import List

from fastapi import FastAPI
from fastapi.middleware.gzip import GZipMiddleware
from sklearn.pipeline import Pipeline

from Preprocessing.preprocessing import Preprocessing
from Embedding.embedding import Embedding
from Planereduction.planereduction import PlaneReduction
from Linearization.linearization import mapToSpaceSampling, computeClusterTopography
from Debug.debug import Debug

app = FastAPI()
app.add_middleware(GZipMiddleware)


class Model(str, Enum):
    HDP = "HDP"


preprocessing = Preprocessing(workers=1)
models = {'HDP': Embedding(method='HDP')}


@app.post("/embedding")
def topic_extraction(descriptions: List[str], method: Model = Model.HDP):
    """
    This endpoint ties together all steps of the topic extraction and performs
    the computations on the data sent to it.

    :param descriptions: The list of texts on which the topic extraction is
        performed.
    :param method: The name of the model used for the embedding step.
        Currently only the HDP model is enabled.
    :return: A JSON formatted string with information about the data points,
        their embeddings and uncertainties, as well as the cluster topography
        and its dimensions.
    """
    pipe = Pipeline([('Preprocessing', preprocessing),
                     ('Embedding', models["HDP"]),
                     ('EmbeddingData', Debug()),