def post(self):
    """Handle a document-search query.

    Embeds the sentence submitted through the form, ranks the user's
    stored sentences by cosine similarity, and renders the results.
    """
    global manager
    if manager is None:
        # Lazily construct the embedding manager on first request.
        manager = doc_manager.DocManager(use_model)
    all_user_sent_ids_embeddings = []
    # Load the user's precomputed (sentparadoc id, embedding) pairs.
    # Only the newest pickle per user is kept on disk (older ones are
    # removed on upload), so plain assignment per match is sufficient.
    for path in glob.glob('{}{}_*sentids*'.format(sentids_embeds_filedir,
                                                  current_user.username)):
        # fix: close the pickle file (original leaked the handle)
        with open(path, 'rb') as fh:
            all_user_sent_ids_embeddings = pickle.load(fh)
    self.question = [self.form.sentence.data]
    # NOTE(review): the original also called
    # manager.sent_to_embeddings(self.question) and discarded the result;
    # dropped here on the assumption that call is side-effect free —
    # confirm against DocManager.
    self.answer = manager.cos_sim(self.question, all_user_sent_ids_embeddings)
    coss = [pair[1] for pair in self.answer]
    sentids = [pair[0] for pair in self.answer]
    sentparadocs = [
        db_models.Sentparadoc.query.filter_by(id=sentid).one()
        for sentid in sentids
    ]
    self.data = zip(sentparadocs, coss)
    return make_response(
        render_template('docsearch.html',
                        form=self.form,
                        data=self.data,
                        documents=self.documents,
                        question=self.question,
                        answer=self.answer))
def post(self):
    """Classify a submitted sentence against the user's context labels.

    Computes cosine similarity between the sentence and the 25 closest
    label embeddings, aggregates similarity mass per context, normalizes
    it, and renders the top-3 contexts.
    """
    global manager
    if manager is None:
        manager = doc_manager.DocManager(use_model)
    all_user_label_ids_embeddings = []
    # Load the user's (label id, embedding) pairs; only the newest pickle
    # per user exists on disk.
    for path in glob.glob('{}{}_*labelids*'.format(labelids_embeds_filedir,
                                                   current_user.username)):
        # fix: close the pickle file (original leaked the handle)
        with open(path, 'rb') as fh:
            all_user_label_ids_embeddings = pickle.load(fh)
    self.question = [self.form.sentence.data]
    # NOTE(review): the original also computed
    # manager.sent_to_embeddings(self.question) and never used it; dropped
    # assuming the call is side-effect free — confirm against DocManager.
    # Take the 25 labels most similar to the question.
    self.answer = manager.cos_sim(self.question,
                                  all_user_label_ids_embeddings, 25)
    coss = [pair[1] for pair in self.answer]
    labels_ids = [pair[0] for pair in self.answer]
    labels = [
        db_models.Label.query.filter_by(id=label_id).one()
        for label_id in labels_ids
    ]
    contexts = [label.context for label in labels]
    sentences = zip(labels, coss)
    # Sum similarity per context object.
    self.data = {}
    for context, cos in zip(contexts, coss):
        self.data[context] = self.data.get(context, 0) + cos
    summ = sum(self.data.values())
    # fix: guard against ZeroDivisionError when no label matched at all
    if summ:
        self.data = {
            k: utils.float_to_int(v / summ, 4)
            for k, v in self.data.items()
        }
    self.data = sorted(self.data.items(), key=lambda x: x[1],
                       reverse=True)[:3]
    return make_response(
        render_template('contextdistribution.html',
                        form=self.form,
                        data=self.data,
                        documents=self.documents,
                        question=self.question,
                        answer=self.answer,
                        sentences=sentences))
def get(self, termid):
    """Render the detail page for a single term.

    Gathers the term's hierarchy (parents, children, equivalents),
    document and co-occurrence counts, embedding-based similar terms and
    sentences, corpus frequency statistics, and the term's domain-context
    distribution.
    """
    global manager
    if manager is None:
        manager = doc_manager.DocManager(use_model)
    term = db_models.Term.query.filter_by(id=termid).first()
    sentterms = db_models.Sentterm.query.filter_by(termid=term.id).all()

    # --- term hierarchy --------------------------------------------------
    childrenids = [
        i.childid for i in db_models.Termchild.query.filter_by(
            parentid=term.id).all()
    ]
    children = [
        db_models.Term.query.filter_by(id=cid).one() for cid in childrenids
    ]
    parentids = [
        i.parentid for i in db_models.Termchild.query.filter_by(
            childid=term.id).all()
    ]
    parents = [
        db_models.Term.query.filter_by(id=pid).one() for pid in parentids
    ]
    equivalentids = [
        i.equivalentid for i in db_models.EquivalentTerm.query.filter_by(
            basetermid=term.id).all()
    ]
    equivalentterms = [
        db_models.Term.query.filter_by(id=eid).one() for eid in equivalentids
    ]
    basetermids = [
        i.basetermid for i in db_models.EquivalentTerm.query.filter_by(
            equivalentid=term.id).all()
    ]
    baseterms = [
        db_models.Term.query.filter_by(id=bid).one() for bid in basetermids
    ]
    # Equivalence is symmetric: union of both directions of the relation.
    equivalents = list(set().union(baseterms, equivalentterms))

    # --- documents containing the term -----------------------------------
    termdocs = {}
    for sentterm in sentterms:
        doc = sentterm.sentparadoc.document
        termdocs[doc] = termdocs.get(doc, 0) + 1

    # --- terms co-occurring in the same sentences ------------------------
    related_terms = {}
    for sentterm in sentterms:
        for item in sentterm.sentparadoc.sentterms:
            related_terms[item.term] = related_terms.get(item.term, 0) + 1
    # fix: targeted removal instead of bare try/except around pop()
    related_terms.pop(term, None)
    related_terms = sorted(related_terms.items(), key=lambda x: x[1],
                           reverse=True)

    # --- similar terms via cosine similarity -----------------------------
    # fix: initialize before the glob loop; the original raised NameError
    # below when no term-embedding pickle existed yet.
    all_user_termids_embeddings = []
    for path in glob.glob('{}{}_*termid*'.format(termids_embeds_filedir,
                                                 current_user.username)):
        # fix: close the pickle file (original leaked the handle)
        with open(path, 'rb') as fh:
            all_user_termids_embeddings = pickle.load(fh)
    answer = manager.cos_sim([term.label], all_user_termids_embeddings)
    coss = [i[1] for i in answer]
    terms_ids = [i[0] for i in answer]
    similar_terms = [
        db_models.Term.query.filter_by(id=tid).first() for tid in terms_ids
    ]
    similar_term_labels = list(zip(similar_terms, coss))
    # Drop terms already linked through the hierarchy, and the term itself;
    # keep the four best remaining matches.
    similar_term_labels = [
        pair for pair in similar_term_labels
        if pair[0] not in parents and pair[0] not in children
        and pair[0] not in equivalents and pair[0] != term
    ][:4]

    # --- similar sentences via cosine similarity -------------------------
    all_user_sent_ids_embeddings = []
    for path in glob.glob('{}{}_*sentids*'.format(sentids_embeds_filedir,
                                                  current_user.username)):
        with open(path, 'rb') as fh:
            all_user_sent_ids_embeddings = pickle.load(fh)
    answer = manager.cos_sim([term.label], all_user_sent_ids_embeddings)
    coss = [i[1] for i in answer]
    sentparadocs_ids = [i[0] for i in answer]
    sentparadocs = [
        db_models.Sentparadoc.query.filter_by(id=sid).one()
        for sid in sentparadocs_ids
    ]
    related_sentparadocs = list(zip(sentparadocs, coss))

    # --- frequency statistics --------------------------------------------
    # Local frequency: occurrences across the user's documents.
    term_freq_docs = len(term.sentterms)
    # term_freq = [kind, universal freq, local freq, relevance] where
    # relevance = sqrt(local) / log(universal), capped at 1.
    term_freq = None
    try:
        try:
            term_freq = [
                'entity', check_freq_file['e'][term.label], term_freq_docs,
                min(
                    int(
                        math.sqrt(term_freq_docs) /
                        math.log(check_freq_file['e'][term.label]) * 100) /
                    100, 1)
            ]
        except Exception:  # fix: no bare except (was swallowing everything)
            term_freq = [
                'term', check_freq_file['n'][term.label], term_freq_docs,
                min(
                    int(
                        math.sqrt(term_freq_docs) /
                        math.log(check_freq_file['n'][term.label]) * 100) /
                    100, 1)
            ]
    except Exception:
        # Term appears only in the user's documents.
        term_freq = [None, None, term_freq_docs, 'specific to user documents']
    try:
        # Discourse-element classification scores, truncated to 2 decimals.
        is_de = check_de_file['key_class'][term.label]
        is_de = sorted(is_de.items(), key=lambda k: k[1], reverse=True)
        is_de = [(i, float(int(j * 100) / 100)) for (i, j) in is_de]
    except Exception:
        is_de = None

    # --- domain/context relevance ----------------------------------------
    all_user_label_ids_embeddings = []
    for path in glob.glob('{}{}_*labelids*'.format(labelids_embeds_filedir,
                                                   current_user.username)):
        with open(path, 'rb') as fh:
            all_user_label_ids_embeddings = pickle.load(fh)
    # For the term label, cosine similarity against every context-label
    # embedding yields (label id, similarity) pairs.
    answer = manager.cos_sim([term.label], all_user_label_ids_embeddings)
    coss = [i[1] for i in answer]
    labels_ids = [i[0] for i in answer]
    labels = [
        db_models.Label.query.filter_by(id=lid).one() for lid in labels_ids
    ]
    contexts = [label.context.name for label in labels]
    # Sum similarity per context name, normalize, keep the top 3.
    data = {}
    for name, cos in zip(contexts, coss):
        data[name] = data.get(name, 0) + cos
    summ = sum(data.values())
    # fix: guard against ZeroDivisionError when no labels matched
    if summ:
        data = {k: utils.float_to_int(v / summ, 2) for k, v in data.items()}
    data = sorted(data.items(), key=lambda x: x[1], reverse=True)[:3]
    return make_response(
        render_template('term.html',
                        term=term,
                        documents=self.documents,
                        similar_term_labels=similar_term_labels,
                        sentterms=sentterms,
                        termdocs=termdocs,
                        related_terms=related_terms,
                        related_sentparadocs=related_sentparadocs,
                        term_freq=term_freq,
                        data=data,
                        is_de=is_de,
                        db_models=db_models,
                        children=children,
                        parents=parents,
                        equivalents=equivalents))
def get(self):
    """Render the terms overview page.

    For every stored term, computes [universal frequency, relevance]
    (relevance = sqrt(local freq) / log(universal freq), capped at 1) and
    its discourse-element classification, then renders 'terms.html'.
    """
    global manager
    if manager is None:
        manager = doc_manager.DocManager(use_model)
    # NOTE(review): the original loaded the user's label- and
    # term-embedding pickles here, but they only fed a commented-out
    # per-term context-similarity computation; both the dead loads and the
    # commented-out block were removed.
    terms = db_models.Term.query.all()
    data = []
    for term in terms:
        # Local frequency: occurrences across the user's documents.
        term_freq_docs = len(term.sentterms)
        try:
            try:
                term_freq = [
                    check_freq_file['e'][term.label],
                    min(
                        int(
                            math.sqrt(term_freq_docs) / math.log(
                                check_freq_file['e'][term.label]) * 100) /
                        100, 1)
                ]
            except Exception:  # fix: no bare except
                term_freq = [
                    check_freq_file['n'][term.label],
                    min(
                        int(
                            math.sqrt(term_freq_docs) / math.log(
                                check_freq_file['n'][term.label]) * 100) /
                        100, 1)
                ]
        except Exception:
            # Term appears only in the user's documents.
            term_freq = [None, 'User-doc-specific']
        try:
            # Discourse-element scores, truncated to 2 decimals.
            is_de = check_de_file['key_class'][term.label]
            is_de = sorted(is_de.items(), key=lambda k: k[1], reverse=True)
            is_de = [(i, float(int(j * 100) / 100)) for (i, j) in is_de]
        except Exception:
            is_de = None
        # Per-term context distribution is currently disabled; keep the
        # placeholder tuple the template expects.
        pair = (None, None)
        data.append((term, term_freq, pair, is_de))
    return make_response(
        render_template('terms.html',
                        documents=self.documents,
                        data=data,
                        db_models=db_models))
def post(self):
    """Handle a document upload.

    For each uploaded file: create a Document row, split the text into
    sentences, persist them as Sentparadoc rows, extract terms per
    sentence and link them via Sentterm rows, and embed the sentences.
    Afterwards, embed any terms that lack embeddings and rewrite the
    user's sentence- and term-embedding pickles, pruning older pickles.
    """
    # Wall-clock start, reported to the user in the final flash message.
    starttime = datetime.now()
    global manager
    if manager is None:
        manager = doc_manager.DocManager(use_model)
    # NOTE(review): all_user_label_ids_embeddings is loaded but never used
    # in this method — confirm whether the load can be removed.
    all_user_label_ids_embeddings = []
    for file in glob.glob('{}{}_*labelids*'.format(labelids_embeds_filedir,
                                                   current_user.username)):
        # glob matches are non-empty strings, so this guard is always true.
        if len(file) != 0:
            # NOTE(review): file handle is never closed (leak); also only
            # the last matching pickle survives the loop.
            all_user_label_ids_embeddings = pickle.load(open(file, 'rb'))
    self.files = self.form.files.data
    # Existing (sentparadoc id, embedding) pairs; new ones are appended
    # below and the whole list is re-pickled at the end.
    all_user_sent_ids_embeddings = []
    for file in glob.glob('{}{}_*sentid*'.format(sentids_embeds_filedir,
                                                 current_user.username)):
        if len(file) != 0:
            all_user_sent_ids_embeddings = pickle.load(open(file, 'rb'))
    for file in self.files:
        # Skip files whose name collides with an existing document title.
        if file.filename in [doc.title for doc in self.documents]:
            flash(
                'Document with name {} already exists!'.format(
                    file.filename), 'info')
            continue
        self.text = utils.read_text_file(file)
        # doc_to_sent yields (paragraph id, sentence id, sentence text)
        # triples — inferred from the indexing below; confirm.
        self.sentences = manager.doc_to_sent(self.text)
        self.document = db_models.Document(title=file.filename,
                                           userid=current_user.id)
        db_models.db.session.add(self.document)
        db_models.db.session.commit()
        for sent in self.sentences:
            sentparadoc = db_models.Sentparadoc(sentid=sent[1],
                                                paraid=sent[0],
                                                docid=self.document.id,
                                                senttext=sent[2])
            db_models.db.session.add(sentparadoc)
            # Per-row commit: slow but gives each row an id immediately.
            db_models.db.session.commit()
        # Re-read this document's sentences to obtain their assigned ids.
        thisdoc_sentparadoc = db_models.Sentparadoc.query.filter_by(
            docid=self.document.id).all()
        # Here we need to already have kept track of the new terms to be
        # added to the db and their association with sentparadoc.
        # Also adding to the db should be highly optimized by already
        # knowing which terms exist and which do not, which means all
        # terms not existing in the db should be added to it in one go
        # instead of individually.
        for sent in thisdoc_sentparadoc:
            # term_list entries are (type code, label); type code is
            # 'e' (entity), 'n' (noun) or anything else (mnp).
            term_list = chunk.sent_terms(sent.senttext)
            for term in term_list:
                # NOTE(review): O(all terms) DB query per extracted term —
                # a known performance hot spot (see comment above).
                if term[1] in [
                        i.label for i in db_models.Term.query.all()
                ]:
                    existing_term = db_models.Term.query.filter_by(
                        label=term[1]).first()
                    newsentterm = db_models.Sentterm(
                        sentparadocid=sent.id, termid=existing_term.id)
                    db_models.db.session.add(newsentterm)
                    db_models.db.session.commit()
                else:
                    # Labels shorter than 3 or longer than 50 characters
                    # are flagged as fake (likely extraction noise).
                    newterm = db_models.Term(
                        label=term[1],
                        termtype=db_models.TermType.entity
                        if term[0] == 'e' else
                        (db_models.TermType.noun
                         if term[0] == 'n' else db_models.TermType.mnp),
                        fake=1 if len(term[1]) < 3 or len(term[1]) > 50
                        else 0)
                    db_models.db.session.add(newterm)
                    db_models.db.session.commit()
                    sentterm = db_models.Sentterm(sentparadocid=sent.id,
                                                  termid=newterm.id)
                    db_models.db.session.add(sentterm)
                    db_models.db.session.commit()
        # Embed this document's sentences and append (id, embedding)
        # pairs to the user's accumulated list.
        sentids = [i.id for i in thisdoc_sentparadoc]
        sents = [sent[2] for sent in self.sentences]
        embeddings = manager.sent_to_embeddings(sents)
        sentids_embeddings = list(zip(sentids, embeddings))
        all_user_sent_ids_embeddings.extend(sentids_embeddings)
    # create term label embeddings for all terms that have no embeddings
    # (placed after the per-file loop: it diffs ALL terms against the
    # existing pickle, so running it once per upload batch suffices —
    # placement reconstructed from flattened source; confirm).
    all_terms = db_models.Term.query.all()
    all_user_termids_embeddings = []
    for file in glob.glob('{}{}_*termid*'.format(termids_embeds_filedir,
                                                 current_user.username)):
        if len(file) != 0:
            all_user_termids_embeddings = pickle.load(open(file, 'rb'))
    all_termids = [i.id for i in all_terms]
    existing_termids = [i[0] for i in all_user_termids_embeddings]
    # Term ids present in the DB but absent from the embeddings pickle.
    new_terms = list(set(all_termids) - set(existing_termids))
    if len(new_terms) != 0:
        new_term_dbobjects = db_models.Term.query.filter(
            db_models.Term.id.in_([i for i in new_terms])).all()
        new_termids = [i.id for i in new_term_dbobjects]
        new_termlabels = [i.label for i in new_term_dbobjects]
        new_term_embeddings = manager.sent_to_embeddings(new_termlabels)
        new_termids_embeddings = list(zip(new_termids, new_term_embeddings))
        all_user_termids_embeddings.extend(new_termids_embeddings)
    # Write a fresh timestamped term-embedding pickle...
    # NOTE(review): handle is never closed/flushed explicitly.
    term_outfile = open(
        '{}{}_{}_termids_embeds.pkl'.format(
            termids_embeds_filedir, current_user.username,
            datetime.now().strftime('%Y%m%d_%H%M%S')), 'wb')
    pickle.dump(all_user_termids_embeddings, term_outfile)
    # ...then delete this user's older pickles, keeping only the newest.
    # NOTE(review): os.chdir changes process-wide CWD — risky in a web app.
    os.chdir(termids_embeds_filedir)
    toberemoved = sorted(os.listdir(termids_embeds_filedir),
                         key=os.path.getmtime)
    # Filenames are '<username>_<timestamp>_...'; [:-1] keeps the newest.
    toberemoved = [
        i for i in toberemoved
        if str(i.split('_')[0]) == current_user.username
    ][:-1]
    for file in toberemoved:
        os.remove(file)
    # Same rotate-and-prune scheme for the sentence-embedding pickle.
    outfile = open(
        '{}{}_{}_sentids_embeds.pkl'.format(
            sentids_embeds_filedir, current_user.username,
            datetime.now().strftime('%Y%m%d_%H%M%S')), 'wb')
    pickle.dump(all_user_sent_ids_embeddings, outfile)
    os.chdir(sentids_embeds_filedir)
    toberemoved = sorted(os.listdir(sentids_embeds_filedir),
                         key=os.path.getmtime)
    toberemoved = [
        i for i in toberemoved
        if str(i.split('_')[0]) == current_user.username
    ][:-1]
    for file in toberemoved:
        os.remove(file)
    flash(
        'Document uploaded in {} seconds'.format(
            (datetime.now() - starttime).seconds), 'success')
    return redirect(request.url)
def post(self):
    """Import a multi-domain context/label CSV.

    Persists any new domains, contexts, and labels, embeds the labels
    not yet embedded, and rewrites the user's label-embedding pickle
    (pruning older pickles).
    """
    global manager
    if manager is None:
        manager = doc_manager.DocManager(use_model)
    all_user_label_ids_embeddings = []
    # Only the newest label-embedding pickle per user exists on disk.
    for path in glob.glob('{}{}_*labelids*'.format(labelids_embeds_filedir,
                                                   current_user.username)):
        # fix: close the pickle file (original leaked the handle)
        with open(path, 'rb') as fh:
            all_user_label_ids_embeddings = pickle.load(fh)
    csvfile = self.form.file.data
    # Mapping: domain name -> {context name: [label text, ...]}.
    data = utils.read_multi_domain_context_label_csv_file(csvfile)
    for domain, con_labs in data.items():
        if domain not in [d.name for d in self.domains]:
            new_domain = db_models.Domain(name=domain,
                                          userid=current_user.id)
            db_models.db.session.add(new_domain)
            db_models.db.session.commit()
        else:
            new_domain = db_models.Domain.query.filter_by(
                name=domain, userid=current_user.id).first()
        contexts = db_models.Context.query.filter_by(
            domainid=new_domain.id).all()
        for context, labels in con_labs.items():
            if context not in [c.name for c in contexts]:
                new_context = db_models.Context(name=context,
                                                domainid=new_domain.id)
                db_models.db.session.add(new_context)
                db_models.db.session.commit()
            else:
                new_context = db_models.Context.query.filter_by(
                    name=context, domainid=new_domain.id).first()
            # NOTE(review): labels are inserted unconditionally, so
            # re-importing the same CSV duplicates Label rows — confirm
            # whether that is acceptable upstream.
            for label in labels:
                new_label = db_models.Label(text=label,
                                            contextid=new_context.id)
                db_models.db.session.add(new_label)
                db_models.db.session.commit()
    # Embed only labels that have not been embedded yet.
    these_labels = db_models.Label.query.filter_by(embedding=False).all()
    labels_ids = [i.id for i in these_labels]
    label_texts = [i.text for i in these_labels]
    embeddings = manager.sent_to_embeddings(label_texts)
    label_ids_embeddings = list(zip(labels_ids, embeddings))
    all_user_label_ids_embeddings.extend(label_ids_embeddings)
    # fix: mark the labels as embedded. The original never cleared the
    # embedding=False flag, so every subsequent import re-embedded the
    # same labels and appended duplicate (id, embedding) pairs.
    for lab in these_labels:
        lab.embedding = True
    db_models.db.session.commit()
    # fix: write the pickle via a context manager so it is flushed/closed.
    with open(
            '{}{}_{}_labelids_embeds.pkl'.format(
                labelids_embeds_filedir, current_user.username,
                datetime.now().strftime('%Y%m%d_%H%M%S')),
            'wb') as outfile:
        pickle.dump(all_user_label_ids_embeddings, outfile)
    # Delete this user's older pickles, keeping only the newest one.
    os.chdir(labelids_embeds_filedir)
    toberemoved = sorted(os.listdir(labelids_embeds_filedir),
                         key=os.path.getmtime)
    # Filenames are '<username>_<timestamp>_...'; [:-1] keeps the newest.
    toberemoved = [
        i for i in toberemoved
        if str(i.split('_')[0]) == current_user.username
    ][:-1]
    for stale in toberemoved:
        os.remove(stale)
    return redirect(request.url)