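# Assumed imports for this excerpt (the original file header is not shown).
# Session, Document, and Zotero are project-local classes; the module paths
# below are hypothetical. `arxiv` is the pre-1.0 arxiv.py package, whose
# query(..., iterative=True) returns a generator factory.
import pdb

import arxiv

from session import Session
from document import Document
from zotero import Zotero
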
class SageBrain(object):
    """ Sage Brain Class"""
    def __init__(self, session_id):
        self.id = "brainInterface"
        self.sessions = []
        self.session_counter = -1
        self.actualSession = -1  # sentinel; replaced by addSession below
        self.addSession(session_id)
        self.zotero = Zotero('2476068', 'user', 'ravDnfy0bMKyuDrKq5kNz5Rh')
        self.sess_folder = './sessData/' + session_id
        self.globalSess = Session('globalSess')

    def Zotero(self, function_name, collection_key, itemKeys,
               collection_items):
        """Dispatch a named operation to the session's Zotero client."""
        value_to_return = False
        if function_name == 'getCollections':
            # Note: getColletions (sic) follows the spelling of the method on
            # this project's Zotero wrapper class.
            value_to_return = self.zotero.getColletions()
        elif function_name == 'getCollectionItems':
            value_to_return = self.zotero.getCollectionItems(collection_key)
        elif function_name == 'downloadItems':
            value_to_return = self.zotero.downloadItems(
                itemKeys, collection_items, self.sess_folder)

        return value_to_return
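    # A minimal usage sketch of the dispatcher, assuming the wrapper methods
    # above; the session ID, collection keys, and item keys are hypothetical:
    #
    #   brain = SageBrain('sess42')
    #   cols = brain.Zotero('getCollections', None, None, None)
    #   items = brain.Zotero('getCollectionItems', cols[0]['key'], None, None)
    #   brain.Zotero('downloadItems', ['KEY1'], items, None)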

    def DocInterface(self, fileName, doc_id, dataType, metadata):
        """Add documents to the session, from a Zotero collection or a file."""
        if dataType == 'zoteroCollection':
            for each_doc in metadata:
                doc_in_sess = self.actualSession.docInSess(each_doc['key'])
                doc_in_global = self.globalSess.docInSess(each_doc['key'])
                if doc_in_sess:
                    print("Doc in sess")
                elif not doc_in_global:
                    print("New Document")
                    doc = Document(self.actualSession, each_doc['name'],
                                   each_doc['key'], 'zotero', "user",
                                   each_doc)
                    self.globalSess.addDoc(doc)
                    self.actualSession.addDoc(doc)
                else:
                    print("Doc in global")
                    doc_from_global = self.globalSess.returnDoc(
                        each_doc['key'])
                    # Placeholder name/key: presumably Document copies its
                    # fields from doc_from_global for 'inSession' sources.
                    doc = Document(self.actualSession, 'name', 'key',
                                   'inSession', "user", doc_from_global)
                    self.actualSession.addDoc(doc)
            # Get the topics and words using the UMAP NSA algorithm and
            # include them in the session.
            self.actualSession.get_topics(self.actualSession.documents)
        else:
            doc = Document(self.actualSession, fileName, doc_id, "doc", "user",
                           False)
            self.actualSession.addDoc(doc)
        # Fit UMAP and project the session documents.
        self.get_projections()
        current_data = self.send_current()
        return current_data
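    # For the 'zoteroCollection' path, `metadata` is assumed to be a list of
    # dicts with at least 'key' and 'name' fields; a sketch with hypothetical
    # values:
    #
    #   brain.DocInterface(None, None, 'zoteroCollection',
    #                      [{'key': 'ABC123', 'name': 'umap_paper.pdf'}])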

    def search_Arxiv(self, paperData, topicData):
        """Query arXiv for related papers and import the new ones."""

        # Papers are stored by ID (how do these match the Zotero ones?).
        def custom_slugify(obj):
            # Build a filename-safe slug from the arXiv URL, e.g.
            # 'http://arxiv.org/abs/1802.03426v2' -> '180203426v2'.
            # Joining every dot-separated part also covers old-style IDs
            # that contain no dot.
            return ''.join(obj.get('id').split('/')[-1].split('.'))

        # Initialization
        number_of_papers = 0
        query = []
        # Parameters
        number_of_results = 3
        max_results_per_query = 50
        # Keep the unordered list: D3 redraws elements in order and would
        # reposition them in the response otherwise; the ordered copy is only
        # used to choose the proper topic.
        paper_data_ordered_by_weight = paperData.sort_values(by=['weight'],
                                                             ascending=False)
        # Paper queries: only the top-weighted paper title is used for now;
        # further paper titles or topic-word strings could be appended here.
        print(paper_data_ordered_by_weight['title'].iloc[0])
        query.append(paper_data_ordered_by_weight['title'].iloc[0])
        for each_query in query:
            # With the pre-1.0 arxiv package, query(..., iterative=True)
            # returns a generator factory, hence the result() call below.
            result = arxiv.query(query=each_query,
                                 max_results=max_results_per_query,
                                 iterative=True)
            for paper in result():
                # Stop once enough results have been collected.
                if number_of_papers == number_of_results:
                    break
                else:
                    # Check whether the paper is already in the session by ID.
                    doc_in_sess = self.actualSession.docInSess(
                        custom_slugify(paper))
                    # IDs can differ across sources, so check the title too.
                    title_in_sess = self.actualSession.documents['title'].isin(
                        [paper['title']]).any()
                    # Run the same ID and title checks against Global.
                    doc_in_global = self.globalSess.docInSess(
                        custom_slugify(paper))
                    title_in_global = self.globalSess.documents['title'].isin(
                        [paper['title']]).any()
                    # If the paper is not yet in this session:
                    if not doc_in_sess and not title_in_sess:
                        # Check Global by ID or title.
                        if doc_in_global or title_in_global:
                            if doc_in_global:
                                print("Document from Arxiv already in Global")
                                doc_from_global = self.globalSess.returnDoc(
                                    custom_slugify(paper))
                                doc = Document(self.actualSession, 'name',
                                               'key', 'inSession', "user",
                                               doc_from_global)
                            else:
                                # TODO: retrieve the document from Global by
                                # title; skip this paper for now so the vector
                                # checks below never see an undefined `doc`.
                                continue
                        # The paper is new (not in Global): download and
                        # process it.
                        else:
                            # Download the full text.
                            print('##### ' + paper['title'] + ' ######')
                            try:
                                arxiv.custom_download(
                                    paper,
                                    dirpath=self.sess_folder + '/documents/',
                                    slugify=custom_slugify)
                            except TypeError:
                                # No downloadable file; abandon the rest of
                                # this query's results.
                                break
                            # Assumes Document takes the owning session, as in
                            # the other constructions above.
                            doc = Document(self.actualSession, paper['title'],
                                           custom_slugify(paper), 'arxiv',
                                           "user", paper)
                            self.globalSess.addDoc(doc)
                        # Ignore the document and do not add it to the session
                        # if its topic vector or abstract vector is shorter
                        # than the smallest one already processed.
                        if (self.actualSession.topic_min_length <= len(
                                doc.topics_vector) and
                                self.actualSession.text_min_length <= len(
                                    doc.abstract_vector)):
                            number_of_papers += 1
                            self.actualSession.addDoc(doc)
                        else:
                            print('Paper cannot be imported: its topic or '
                                  'abstract vector is shorter than the '
                                  'smallest vector in the system')
                    else:
                        # The paper is already in the session.
                        print('### ' + paper['title'] +
                              ' already in Session ##')

        # Store the new papers in the session and in Global.
        self.globalSess.storeSessData()
        self.actualSession.storeSessData()
        # Get the topics and words using the UMAP NSA algorithm and include
        # them in the session.
        self.actualSession.get_topics(self.actualSession.documents)
        # Fit UMAP and project the session documents.
        self.get_projections()
        current_data = self.send_current()
        return current_data
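    # `paperData` is assumed to be a pandas DataFrame with at least 'title'
    # and 'weight' columns; a minimal sketch with hypothetical values:
    #
    #   import pandas as pd
    #   paperData = pd.DataFrame({'title': ['Some paper title'],
    #                             'weight': [0.92]})
    #   brain.search_Arxiv(paperData, topicData)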

    def UpdateModel(self, new_paper_data):
        """Update the session's model with new paper data."""
        self.actualSession.update_model(new_paper_data)
        return self.actualSession.documents

    def addCitations(self):
        """Fetch scholar info for each session document and build messages."""
        documents_msg = []
        for each_doc in self.actualSession.documents:
            print(each_doc.title)
            each_doc.GetScholarInfo()
            documents_msg.append(each_doc.create_document_msg())
        return {"documents": documents_msg}

    def get_projections(self):
        # Project the session's papers into 2D with UMAP, then persist both
        # the global and the current session.
        self.actualSession.documents = self.actualSession.train_fit_UMAP_2D(
            self.actualSession.documents)
        self.globalSess.storeSessData()
        self.actualSession.storeSessData()

    def addSession(self, session_id):
        """Add a new session and make it the active one."""
        self.session_counter += 1
        self.sessions.append(Session(session_id))
        self.actualSession = self.sessions[self.session_counter]

    def send_current(self):
        # Assign session papers to session topics.
        self.actualSession.assign_topics_to_documents()
        # Collect the publication years.
        years = self.actualSession.get_years()
        return {
            "documents": self.actualSession.documents,
            "years": years,
            "authors": self.actualSession.authorList,
            "doc_topics": {
                'topics': self.actualSession.topics,
                'topic_params': self.actualSession.topic_params,
                'order': self.actualSession.topics.columns.values.tolist(),
                'words': self.actualSession.words.to_json(orient='records')
            }
        }
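    # A caller might forward this payload to the front end; a minimal sketch
    # with a hypothetical handler, assuming the payload is serializable:
    #
    #   brain = SageBrain('sess42')
    #   payload = brain.send_current()
    #   socket.emit('current_data', payload)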