Example #1
0
    def post(self):
        """Handle a sentence-search request.

        Embeds the submitted sentence, ranks the user's stored sentence
        embeddings by cosine similarity and renders the search-results page.
        """
        global manager
        if manager is None:
            # Lazily build the (expensive) document manager once per process.
            manager = doc_manager.DocManager(use_model)

        # Load the user's latest "<user>_*sentids*" pickle of
        # (sentparadoc_id, embedding) pairs; stays empty if none exists yet.
        all_user_sent_ids_embeddings = []
        for path in glob.glob('{}{}_*sentids*'.format(sentids_embeds_filedir,
                                                      current_user.username)):
            # Context manager closes the pickle file (was leaked before).
            with open(path, 'rb') as fh:
                all_user_sent_ids_embeddings = pickle.load(fh)

        self.question = [self.form.sentence.data]

        # Pairs of (sentparadoc_id, cosine_similarity), best matches first.
        self.answer = manager.cos_sim(self.question,
                                      all_user_sent_ids_embeddings)
        coss = [pair[1] for pair in self.answer]
        sentids = [pair[0] for pair in self.answer]

        sentparadocs = [
            db_models.Sentparadoc.query.filter_by(id=sentid).one()
            for sentid in sentids
        ]
        self.data = zip(sentparadocs, coss)

        return make_response(
            render_template('docsearch.html',
                            form=self.form,
                            data=self.data,
                            documents=self.documents,
                            question=self.question,
                            answer=self.answer))
Example #2
0
    def post(self):
        """Handle a context-distribution query.

        Embeds the submitted sentence, scores it against the user's label
        embeddings and renders the top-three context distribution.
        """
        global manager
        if manager is None:
            # Lazily build the (expensive) document manager once per process.
            manager = doc_manager.DocManager(use_model)

        # Load the user's latest "<user>_*labelids*" pickle of
        # (label_id, embedding) pairs; stays empty if none exists yet.
        all_user_label_ids_embeddings = []
        for path in glob.glob('{}{}_*labelids*'.format(labelids_embeds_filedir,
                                                       current_user.username)):
            # Context manager closes the pickle file (was leaked before).
            with open(path, 'rb') as fh:
                all_user_label_ids_embeddings = pickle.load(fh)

        self.question = [self.form.sentence.data]

        # Top-25 (label_id, cosine_similarity) pairs for the question.
        self.answer = manager.cos_sim(self.question,
                                      all_user_label_ids_embeddings, 25)
        coss = [pair[1] for pair in self.answer]
        labels_ids = [pair[0] for pair in self.answer]

        labels = [
            db_models.Label.query.filter_by(id=lid).one() for lid in labels_ids
        ]
        contexts = [label.context for label in labels]
        sentences = zip(labels, coss)

        # Accumulate the similarity mass contributed to each context.
        self.data = {}
        for context, cos in zip(contexts, coss):
            self.data[context] = self.data.get(context, 0) + cos

        # Normalise to a distribution and keep the three best contexts.
        summ = sum(self.data.values())
        self.data = {
            k: (utils.float_to_int(v / summ, 4))
            for k, v in self.data.items()
        }
        self.data = sorted(self.data.items(), key=lambda x: x[1],
                           reverse=True)[:3]

        return make_response(
            render_template('contextdistribution.html',
                            form=self.form,
                            data=self.data,
                            documents=self.documents,
                            question=self.question,
                            answer=self.answer,
                            sentences=sentences))
Example #3
0
    def get(self, termid):
        """Render the detail page for a single term.

        Gathers the term's relations (children, parents, equivalents),
        co-occurring and embedding-similar terms, related sentences,
        frequency estimates and a domain/context relevance distribution.
        """
        global manager
        if manager is None:
            # Lazily build the (expensive) document manager once per process.
            manager = doc_manager.DocManager(use_model)

        term = db_models.Term.query.filter_by(id=termid).first()
        sentterms = db_models.Sentterm.query.filter_by(termid=term.id).all()

        childrenids = [
            link.childid for link in db_models.Termchild.query.filter_by(
                parentid=term.id).all()
        ]
        children = [
            db_models.Term.query.filter_by(id=cid).one()
            for cid in childrenids
        ]

        parentids = [
            link.parentid for link in db_models.Termchild.query.filter_by(
                childid=term.id).all()
        ]
        parents = [
            db_models.Term.query.filter_by(id=pid).one() for pid in parentids
        ]

        equivalentids = [
            link.equivalentid
            for link in db_models.EquivalentTerm.query.filter_by(
                basetermid=term.id).all()
        ]
        equivalentterms = [
            db_models.Term.query.filter_by(id=eid).one()
            for eid in equivalentids
        ]
        basetermids = [
            link.basetermid
            for link in db_models.EquivalentTerm.query.filter_by(
                equivalentid=term.id).all()
        ]
        baseterms = [
            db_models.Term.query.filter_by(id=bid).one()
            for bid in basetermids
        ]

        # Equivalence is symmetric: merge both directions, deduplicated.
        equivalents = list(set().union(baseterms, equivalentterms))

        # Count in how many sentences of each document the term occurs.
        termdocs = {}
        for sentterm in sentterms:
            document = sentterm.sentparadoc.document
            termdocs[document] = termdocs.get(document, 0) + 1

        # Terms co-occurring in the same sentences, ranked by frequency.
        related_terms = {}
        for sentterm in sentterms:
            for item in sentterm.sentparadoc.sentterms:
                related_terms[item.term] = related_terms.get(item.term, 0) + 1
        related_terms.pop(term, None)  # a term is not related to itself
        related_terms = sorted(related_terms.items(),
                               key=lambda x: x[1],
                               reverse=True)

        # Similar terms using cosine similarity over term-label embeddings.
        # BUG FIX: initialise before the loop so a missing pickle file no
        # longer raises NameError at the cos_sim call below.
        all_user_termids_embeddings = []
        for path in glob.glob('{}{}_*termid*'.format(termids_embeds_filedir,
                                                     current_user.username)):
            with open(path, 'rb') as fh:
                all_user_termids_embeddings = pickle.load(fh)

        answer = manager.cos_sim([term.label], all_user_termids_embeddings)
        coss = [pair[1] for pair in answer]
        terms_ids = [pair[0] for pair in answer]
        similar_terms = [
            db_models.Term.query.filter_by(id=tid).first()
            for tid in terms_ids
        ]
        similar_term_labels = list(zip(similar_terms, coss))
        # Keep only terms not already linked to this one; top four.
        similar_term_labels = [
            pair for pair in similar_term_labels
            if pair[0] not in parents and pair[0] not in children
            and pair[0] not in equivalents and pair[0] != term
        ][:4]

        # Similar sentences using cosine similarity over sentence embeddings.
        all_user_sent_ids_embeddings = []
        for path in glob.glob('{}{}_*sentids*'.format(sentids_embeds_filedir,
                                                      current_user.username)):
            with open(path, 'rb') as fh:
                all_user_sent_ids_embeddings = pickle.load(fh)
        answer = manager.cos_sim([term.label], all_user_sent_ids_embeddings)
        coss = [pair[1] for pair in answer]
        sentparadocs_ids = [pair[0] for pair in answer]
        sentparadocs = [
            db_models.Sentparadoc.query.filter_by(id=sid).one()
            for sid in sentparadocs_ids
        ]
        related_sentparadocs = list(zip(sentparadocs, coss))

        # Term frequency in the user's documents.
        term_freq_docs = len(term.sentterms)

        # Universal frequency / relevance: try the entity table first, then
        # the noun table, else mark as specific to the user's documents.
        # Narrowed from bare `except:` -- KeyError for unknown labels,
        # ZeroDivisionError/ValueError for log(1)/log(<=0) frequencies.
        term_freq = None
        try:
            try:
                term_freq = [
                    'entity', check_freq_file['e'][term.label], term_freq_docs,
                    min(
                        int(
                            math.sqrt(term_freq_docs) /
                            math.log(check_freq_file['e'][term.label]) * 100) /
                        100, 1)
                ]
            except (KeyError, ValueError, ZeroDivisionError):
                term_freq = [
                    'term', check_freq_file['n'][term.label], term_freq_docs,
                    min(
                        int(
                            math.sqrt(term_freq_docs) /
                            math.log(check_freq_file['n'][term.label]) * 100) /
                        100, 1)
                ]
        except (KeyError, ValueError, ZeroDivisionError):
            term_freq = [
                None, None, term_freq_docs, 'specific to user documents'
            ]

        # Domain-entity classes with scores truncated to two decimals.
        try:
            is_de = check_de_file['key_class'][term.label]
            is_de = sorted(is_de.items(), key=lambda k: k[1], reverse=True)
            is_de = [(i, float(int(j * 100) / 100)) for (i, j) in is_de]
        except (KeyError, AttributeError):
            is_de = None

        # Compute domain relevance from the user's label embeddings.
        all_user_label_ids_embeddings = []
        for path in glob.glob('{}{}_*labelids*'.format(
                labelids_embeds_filedir, current_user.username)):
            with open(path, 'rb') as fh:
                all_user_label_ids_embeddings = pickle.load(fh)

        answer = manager.cos_sim([term.label], all_user_label_ids_embeddings)
        coss = [pair[1] for pair in answer]
        labels_ids = [pair[0] for pair in answer]
        labels = [
            db_models.Label.query.filter_by(id=lid).one() for lid in labels_ids
        ]
        contexts = [label.context.name for label in labels]

        # Sum similarity per context, normalise, keep the top three.
        data = {}
        for context, cos in zip(contexts, coss):
            data[context] = data.get(context, 0) + cos

        summ = sum(data.values())
        data = {k: (utils.float_to_int(v / summ, 2)) for k, v in data.items()}
        data = sorted(data.items(), key=lambda x: x[1], reverse=True)[:3]

        return make_response(
            render_template('term.html',
                            term=term,
                            documents=self.documents,
                            similar_term_labels=similar_term_labels,
                            sentterms=sentterms,
                            termdocs=termdocs,
                            related_terms=related_terms,
                            related_sentparadocs=related_sentparadocs,
                            term_freq=term_freq,
                            data=data,
                            is_de=is_de,
                            db_models=db_models,
                            children=children,
                            parents=parents,
                            equivalents=equivalents))
Example #4
0
    def get(self):
        """Render the terms overview page.

        For every term: its universal frequency, a document-relevance score
        and its domain-entity classes.
        """
        global manager
        if manager is None:
            # Lazily build the (expensive) document manager once per process.
            manager = doc_manager.DocManager(use_model)

        # NOTE(review): these caches feed a per-term domain-relevance
        # computation that is currently disabled (see `pair` below); the
        # loads are kept so re-enabling it stays a local change.
        all_user_label_ids_embeddings = []
        for path in glob.glob('{}{}_*labelids*'.format(labelids_embeds_filedir,
                                                       current_user.username)):
            with open(path, 'rb') as fh:
                all_user_label_ids_embeddings = pickle.load(fh)

        all_user_termids_embeddings = []
        for path in glob.glob('{}{}_*termid*'.format(termids_embeds_filedir,
                                                     current_user.username)):
            with open(path, 'rb') as fh:
                all_user_termids_embeddings = pickle.load(fh)

        terms = db_models.Term.query.all()
        data = []
        for term in terms:
            term_freq_docs = len(term.sentterms)
            # [universal_frequency, relevance]: entity table first, then the
            # noun table, else mark as specific to the user's documents.
            # Narrowed from bare `except:` -- KeyError for unknown labels,
            # ZeroDivisionError/ValueError for log(1)/log(<=0).
            term_freq = []
            try:
                try:
                    term_freq = [
                        check_freq_file['e'][term.label],
                        min(
                            int(
                                math.sqrt(term_freq_docs) / math.log(
                                    check_freq_file['e'][term.label]) * 100) /
                            100, 1)
                    ]
                except (KeyError, ValueError, ZeroDivisionError):
                    term_freq = [
                        check_freq_file['n'][term.label],
                        min(
                            int(
                                math.sqrt(term_freq_docs) / math.log(
                                    check_freq_file['n'][term.label]) * 100) /
                            100, 1)
                    ]
            except (KeyError, ValueError, ZeroDivisionError):
                term_freq = [None, 'User-doc-specific']

            # Domain-entity classes with scores truncated to two decimals.
            try:
                is_de = check_de_file['key_class'][term.label]
                is_de = sorted(is_de.items(), key=lambda k: k[1], reverse=True)
                is_de = [(i, float(int(j * 100) / 100)) for (i, j) in is_de]
            except (KeyError, AttributeError):
                is_de = None

            # Placeholder for the disabled domain-relevance computation.
            pair = (None, None)

            data.append((term, term_freq, pair, is_de))

        return make_response(
            render_template('terms.html',
                            documents=self.documents,
                            data=data,
                            db_models=db_models))
Example #5
0
    def post(self):
        """Handle document upload.

        Splits each uploaded file into sentences, persists sentences and
        extracted terms, then refreshes the user's sentence- and
        term-embedding pickle caches (keeping only the newest file of each
        kind).
        """
        starttime = datetime.now()
        global manager
        if manager is None:
            # Lazily build the (expensive) document manager once per process.
            manager = doc_manager.DocManager(use_model)

        # NOTE(review): loaded but never used below -- presumably kept for
        # parity with the other views; confirm before removing.
        all_user_label_ids_embeddings = []
        for path in glob.glob('{}{}_*labelids*'.format(labelids_embeds_filedir,
                                                       current_user.username)):
            with open(path, 'rb') as fh:
                all_user_label_ids_embeddings = pickle.load(fh)

        self.files = self.form.files.data

        # Current (sentparadoc_id, embedding) cache; new sentences append.
        all_user_sent_ids_embeddings = []
        for path in glob.glob('{}{}_*sentid*'.format(sentids_embeds_filedir,
                                                     current_user.username)):
            with open(path, 'rb') as fh:
                all_user_sent_ids_embeddings = pickle.load(fh)

        for upload in self.files:
            # Skip uploads whose title already exists for this user.
            if upload.filename in [doc.title for doc in self.documents]:
                flash(
                    'Document with name {} already exists!'.format(
                        upload.filename), 'info')
                continue
            self.text = utils.read_text_file(upload)
            self.sentences = manager.doc_to_sent(self.text)

            self.document = db_models.Document(title=upload.filename,
                                               userid=current_user.id)
            db_models.db.session.add(self.document)
            db_models.db.session.commit()

            # Persist every sentence with its paragraph/sentence position.
            for sent in self.sentences:
                sentparadoc = db_models.Sentparadoc(sentid=sent[1],
                                                    paraid=sent[0],
                                                    docid=self.document.id,
                                                    senttext=sent[2])
                db_models.db.session.add(sentparadoc)
            db_models.db.session.commit()

            thisdoc_sentparadoc = db_models.Sentparadoc.query.filter_by(
                docid=self.document.id).all()

            # Link each extracted term to its sentence, creating new Term
            # rows on first sight.
            for sent in thisdoc_sentparadoc:
                term_list = chunk.sent_terms(sent.senttext)
                for term in term_list:
                    # PERF FIX: one indexed lookup per candidate instead of
                    # materialising every Term label for each membership
                    # test (was O(candidates x terms) per sentence).
                    existing_term = db_models.Term.query.filter_by(
                        label=term[1]).first()
                    if existing_term is not None:
                        newsentterm = db_models.Sentterm(
                            sentparadocid=sent.id, termid=existing_term.id)
                        db_models.db.session.add(newsentterm)
                        db_models.db.session.commit()
                    else:
                        newterm = db_models.Term(
                            label=term[1],
                            termtype=db_models.TermType.entity
                            if term[0] == 'e' else
                            (db_models.TermType.noun
                             if term[0] == 'n' else db_models.TermType.mnp),
                            # Implausibly short/long labels are flagged fake.
                            fake=1
                            if len(term[1]) < 3 or len(term[1]) > 50 else 0)
                        db_models.db.session.add(newterm)
                        db_models.db.session.commit()
                        sentterm = db_models.Sentterm(sentparadocid=sent.id,
                                                      termid=newterm.id)
                        db_models.db.session.add(sentterm)
                        db_models.db.session.commit()

            sentids = [i.id for i in thisdoc_sentparadoc]
            sents = [sent[2] for sent in self.sentences]
            embeddings = manager.sent_to_embeddings(sents)
            all_user_sent_ids_embeddings.extend(zip(sentids, embeddings))

        # Create term-label embeddings for all terms lacking one.
        all_terms = db_models.Term.query.all()

        all_user_termids_embeddings = []
        for path in glob.glob('{}{}_*termid*'.format(termids_embeds_filedir,
                                                     current_user.username)):
            with open(path, 'rb') as fh:
                all_user_termids_embeddings = pickle.load(fh)

        all_termids = [i.id for i in all_terms]
        existing_termids = [i[0] for i in all_user_termids_embeddings]
        new_terms = list(set(all_termids) - set(existing_termids))
        if len(new_terms) != 0:
            new_term_dbobjects = db_models.Term.query.filter(
                db_models.Term.id.in_(new_terms)).all()
            new_termids = [i.id for i in new_term_dbobjects]
            new_termlabels = [i.label for i in new_term_dbobjects]
            new_term_embeddings = manager.sent_to_embeddings(new_termlabels)
            all_user_termids_embeddings.extend(
                zip(new_termids, new_term_embeddings))

            # BUG FIX: the dump file is now closed (and therefore flushed)
            # before the mtime-based cleanup below inspects the directory.
            with open(
                    '{}{}_{}_termids_embeds.pkl'.format(
                        termids_embeds_filedir, current_user.username,
                        datetime.now().strftime('%Y%m%d_%H%M%S')),
                    'wb') as term_outfile:
                pickle.dump(all_user_termids_embeddings, term_outfile)
            # Keep only this user's newest term-embedding pickle.
            os.chdir(termids_embeds_filedir)
            toberemoved = sorted(os.listdir(termids_embeds_filedir),
                                 key=os.path.getmtime)
            toberemoved = [
                name for name in toberemoved
                if str(name.split('_')[0]) == current_user.username
            ][:-1]
            for name in toberemoved:
                os.remove(name)

        # Same write-then-prune pattern for the sentence-embedding cache.
        with open(
                '{}{}_{}_sentids_embeds.pkl'.format(
                    sentids_embeds_filedir, current_user.username,
                    datetime.now().strftime('%Y%m%d_%H%M%S')),
                'wb') as outfile:
            pickle.dump(all_user_sent_ids_embeddings, outfile)
        os.chdir(sentids_embeds_filedir)
        toberemoved = sorted(os.listdir(sentids_embeds_filedir),
                             key=os.path.getmtime)
        toberemoved = [
            name for name in toberemoved
            if str(name.split('_')[0]) == current_user.username
        ][:-1]
        for name in toberemoved:
            os.remove(name)
        flash(
            'Document uploaded in {} seconds'.format(
                (datetime.now() - starttime).seconds), 'success')
        return redirect(request.url)
Example #6
0
    def post(self):
        """Import a multi-domain context/label CSV.

        Creates any missing domains, contexts and labels, then rebuilds the
        user's label-embedding pickle cache (keeping only the newest file).
        """
        global manager
        if manager is None:
            # Lazily build the (expensive) document manager once per process.
            manager = doc_manager.DocManager(use_model)

        # Load the user's current (label_id, embedding) pairs, if any.
        all_user_label_ids_embeddings = []
        for path in glob.glob('{}{}_*labelids*'.format(labelids_embeds_filedir,
                                                       current_user.username)):
            # Context manager closes the pickle file (was leaked before).
            with open(path, 'rb') as fh:
                all_user_label_ids_embeddings = pickle.load(fh)

        csvfile = self.form.file.data
        data = utils.read_multi_domain_context_label_csv_file(csvfile)
        for domain, con_labs in data.items():
            # Create the domain on first sight, otherwise reuse it.
            if domain not in [d.name for d in self.domains]:
                new_domain = db_models.Domain(name=domain,
                                              userid=current_user.id)
                db_models.db.session.add(new_domain)
                db_models.db.session.commit()
            else:
                new_domain = db_models.Domain.query.filter_by(
                    name=domain, userid=current_user.id).first()
            contexts = db_models.Context.query.filter_by(
                domainid=new_domain.id).all()
            for context, labels in con_labs.items():
                # Same create-or-reuse pattern for contexts.
                if context not in [c.name for c in contexts]:
                    new_context = db_models.Context(name=context,
                                                    domainid=new_domain.id)
                    db_models.db.session.add(new_context)
                    db_models.db.session.commit()
                else:
                    new_context = db_models.Context.query.filter_by(
                        name=context, domainid=new_domain.id).first()
                # Labels are always appended (no dedup in the schema here).
                for label in labels:
                    new_label = db_models.Label(text=label,
                                                contextid=new_context.id)
                    db_models.db.session.add(new_label)
                db_models.db.session.commit()

        # Embed every label that does not have an embedding yet.
        these_labels = db_models.Label.query.filter_by(embedding=False).all()
        labels_ids = [i.id for i in these_labels]
        label_texts = [i.text for i in these_labels]
        embeddings = manager.sent_to_embeddings(label_texts)
        all_user_label_ids_embeddings.extend(zip(labels_ids, embeddings))

        # BUG FIX: the dump file is now closed (and therefore flushed)
        # before the mtime-based cleanup below inspects the directory.
        with open(
                '{}{}_{}_labelids_embeds.pkl'.format(
                    labelids_embeds_filedir, current_user.username,
                    datetime.now().strftime('%Y%m%d_%H%M%S')),
                'wb') as outfile:
            pickle.dump(all_user_label_ids_embeddings, outfile)

        # Keep only this user's newest label-embedding pickle.
        os.chdir(labelids_embeds_filedir)
        toberemoved = sorted(os.listdir(labelids_embeds_filedir),
                             key=os.path.getmtime)
        toberemoved = [
            name for name in toberemoved
            if str(name.split('_')[0]) == current_user.username
        ][:-1]
        for name in toberemoved:
            os.remove(name)

        return redirect(request.url)