예제 #1
0
   def run(self):
      """Connect to the database and announce the outcome.

      Builds a DB object from the host, database, table, user and password
      stored on this instance, calls its connect() method, and emits the
      old-style "postLogin" signal carrying the connect() result together
      with the DB object itself.
      """
      connection = DB(
         self.dbHost,
         self.dbDatabase,
         self.dbTable,
         self.dbUser,
         self.dbPass,
      )

      # The connect() result is forwarded untouched; listeners decide what it means.
      status = connection.connect()

      postLogin = SIGNAL("postLogin(PyQt_PyObject, PyQt_PyObject)")
      self.emit(postLogin, status, connection)
예제 #2
0
    def run(self):
        """Connect to the database and announce the outcome.

        Builds a DB object from the connection settings stored on this
        instance, calls its connect() method, and emits postLoginSignal with
        the connect() result and the DB object.
        """
        connection = DB(
            self.dbHost,
            self.dbDatabase,
            self.dbTable,
            self.dbUser,
            self.dbPass,
        )

        # The connect() result is forwarded untouched; listeners decide what it means.
        status = connection.connect()

        self.postLoginSignal.emit(status, connection)
 def __init__(self, vec_file, pap, pat, pro):
     """Load the word-vector model and the three document indexes.

     vec_file is read as a binary word2vec file; pap, pat and pro are
     handed to AnnoyIndexer.load() for the paper, patent and project
     indexes respectively. Also creates the text-to-vector converter, the
     cutter, the DB handle and the feature index.
     """
     # NOTE(review): a commented-out alternative in the original used
     # gensim.models.KeyedVectors.load_word2vec_format; confirm which
     # loader matches the installed gensim version.
     load_vectors = gensim.models.word2vec.Word2Vec.load_word2vec_format
     self.wm = load_vectors(vec_file, binary=True)

     # Each index is attached to the instance first and loaded afterwards.
     for attr, path in (("paper_index", pap),
                        ("patent_index", pat),
                        ("project_index", pro)):
         setattr(self, attr, AnnoyIndexer())
         getattr(self, attr).load(path)

     self.t2v = Convert2Vec(self.wm)
     self.cuttor = FilterCut()
     self.db = DB()
     self.featureIndex = self.buildFeatureIndex()
예제 #4
0
   def start(self):
      """Prompt for database settings, connect, then run the main menu.

      Keeps asking for connection details until connectToDatabase() reports
      c.SUCCESS or the user answers "n" to the retry prompt (which exits
      the process). Ctrl-C is swallowed, and the database handle is closed
      on the way out whenever one was created.
      """
      # Compiled once up front so the card-reading code can reuse the
      # pattern that pulls the card ID out of the raw swipe data.
      self.regex = re.compile(";(.+)=")

      try:
         while True:
            self.getDbInfo()
            self.db = DB(self.dbHost, c.DEFAULT_DATABASE, self.dbTable, self.dbUser, self.dbPass)

            if self.connectToDatabase() == c.SUCCESS:
               break

            # Connection failed: offer to re-enter the settings.
            answer = raw_input("Failed to connect to database. Re-enter database info? (Y,n) ")
            if answer.lower() == "n":
               print("Bye.")
               sys.exit(0)

         self.displayMenu()

      except KeyboardInterrupt:
         pass
      finally:
         print("Cleaning up and exiting...")
         if self.db is not None:
            self.db.close()
예제 #5
0
class TextUI:
   """Console front-end for the card check-in / points system.

   All database work is delegated to a DB object created in start();
   status codes and defaults come from the `c` module and input cleaning
   from SharedUtils, both defined elsewhere in the project.

   Output uses the parenthesised single-argument form of print, which
   behaves the same as the print statement under Python 2.
   """

   # Largest point value accepted for one check-in (guards against typos).
   MAX_POINTS = 500

   def __init__(self):
      # Stays None until start() has created a DB object.
      self.db = None


   def start(self):
      """Prompt for database settings, connect, then run the main menu.

      Loops until connectToDatabase() returns c.SUCCESS or the user answers
      "n" to the retry prompt (which exits the process). Ctrl-C is
      swallowed; the DB handle is closed on the way out if one exists.
      """
      # Compile the regex for pulling the card ID from all the data on a card.
      # Done here so it isn't repeated on every swipe.
      self.regex = re.compile(";(.+)=")

      try:
         while True:
            self.getDbInfo()
            self.db = DB(self.dbHost, c.DEFAULT_DATABASE, self.dbTable, self.dbUser, self.dbPass)

            if self.connectToDatabase() == c.SUCCESS:
               break

            # Connection failed: offer to re-enter the settings.
            reenter = raw_input("Failed to connect to database. Re-enter database info? (Y,n) ")
            if reenter.lower() == "n":
               print("Bye.")
               sys.exit(0)

         # Start the main menu loop
         self.displayMenu()

      except KeyboardInterrupt:
         pass
      finally:
         print("Cleaning up and exiting...")
         if self.db is not None:
            self.db.close()


   def displayMenu(self):
      """Show the main menu repeatedly and dispatch the chosen action."""
      print("\nType \"back\" at any time to go up a menu level.")

      while True:
         print("\n\t1.) Check-in\n\t2.) Show Points\n\t3.) Exit")
         try:
            option = raw_input("\n>> ")

            if option == "1":
               self.checkin()
            elif option == "2":
               self.showPoints()
            elif option == "3":
               sys.exit(0)
            elif option in ("back", "exit"):
               # Local renamed so it no longer shadows the exit builtin.
               confirm = raw_input("Exit? (y,N) ")
               if confirm.lower() == "y":
                  sys.exit(0)
            else:
               self.invalidInput()

         except ValueError:
            self.invalidInput()


   def connectToDatabase(self):
      """Call self.db.connect(), print the outcome and return a status.

      Returns c.SUCCESS or c.BAD_PASSWD unchanged; any other connect()
      result is reported as an unknown error and mapped to c.FAILURE.
      """
      # Use stdout.write to prevent newline
      sys.stdout.write("Connecting to database...")

      status = self.db.connect()

      if status == c.SUCCESS:
         print("done.")
         return status
      elif status == c.BAD_PASSWD:
         print("\nError connecting to database: Bad username or password.")
         return status
      else:
         print("\nUnknown Error connecting to database.")
         return c.FAILURE


   def checkin(self):
      """Ask for a point value, then check in cards until the user backs out.

      An empty point value selects c.DEFAULT_POINTS; values above
      MAX_POINTS or non-numeric input are rejected. Typing "back" at the
      point prompt returns to the caller immediately.
      """
      while True:
         pointValue = SharedUtils.sanitizeInput(raw_input("\nPoint Value (" + str(c.DEFAULT_POINTS) + "): "))

         if pointValue == "":
            pointValue = str(c.DEFAULT_POINTS)
            break
         elif pointValue == "back":
            # Previously this fell through to the swipe loop and would have
            # checked cards in with the literal value "back".
            return
         elif pointValue.isdigit() and int(pointValue) <= self.MAX_POINTS:
            break
         else:
            print("Invalid input. Try again.")

      while True:
         cardID = self.getCardSwipe()

         # If the user requested to exit the loop, break
         if cardID == c.BACK:
            break
         elif cardID == c.ERROR_READING_CARD:
            print("Error reading card. Swipe card again.")
            continue

         # cardID will be empty if it failed sanitization; skip it in that case.
         cardID = SharedUtils.sanitizeInput(cardID)
         if cardID == "":
            continue

         checkinResult = self.db.checkin(cardID, pointValue)
         checkinStatus = checkinResult["checkinStatus"]

         if checkinStatus == c.SQL_ERROR:
            self.showDatabaseError(checkinResult["sqlError"])
         elif checkinStatus == c.BAD_CHECKIN_TIME:
            print("Error: You may only check-in once per hour.")
         elif checkinStatus == c.FUTURE_CHECKIN_TIME:
            print("Error: Previous check-in time was in the future. Check your local system time.")
         elif checkinStatus == c.CARD_NOT_IN_DB:
            # Ask if user wants to add the card
            addCard = raw_input("Error: Card not found in database. Add it now? (Y,n) ")

            # Case-insensitive, matching the other Y/n prompts in this class.
            if addCard.lower() == "n":
               continue

            # Get the accessID for the new card
            accessID = SharedUtils.sanitizeInput(raw_input("Access ID: "))

            addCardResult = self.db.addCard(cardID, accessID, pointValue)

            if addCardResult["addCardStatus"] == c.SUCCESS:
               self.showCheckinConfirmation(accessID, pointValue)
            elif addCardResult["addCardStatus"] == c.SQL_ERROR:
               self.showDatabaseError(addCardResult["sqlError"])

         elif checkinStatus == c.SUCCESS:
            self.showCheckinConfirmation(checkinResult["accessID"], pointValue)
         else:
            print("Unknown error checking in.")


   def showPoints(self):
      """Print the points for one access ID, or a table for all of them."""
      accessID = SharedUtils.sanitizeInput(raw_input("\nAccess ID (blank for all): "))
      showPointsResult = self.db.showPoints(accessID)
      status = showPointsResult["showPointsStatus"]

      if status == c.SQL_ERROR:
         self.showDatabaseError(showPointsResult["sqlError"])
      elif status == c.NO_RESULTS:
         print("\nThere were no results to that query.")
      elif status == c.SUCCESS:
         rows = showPointsResult["pointsTuple"]

         # If showing all users, display a pretty table
         if accessID == "":
            print("\n+--------------------+\n| Access ID | Points |\n+--------------------+")

            for row in rows:
               print("|%10s | %6s |" % (row[0], row[1]))
            print("+--------------------+")

         # Show a single user's points
         else:
            print("\n%s has %s points." % (accessID, str(rows[0][0])))


   def getCardSwipe(self):
      """Read one card swipe and return the card ID or a status code.

      Returns the text captured between ";" and "=" in the swipe data,
      c.BACK when the input contains "exit" or "back", and
      c.ERROR_READING_CARD otherwise. start() must have run first so that
      self.regex exists.
      """
      # Read the card data as a password so it doesn't show on the screen
      cardID = SharedUtils.sanitizeInput(getpass.getpass("\nWaiting for card swipe..."))

      match = self.regex.search(cardID)
      if match is not None:
         return match.group(1)

      # If exit or back, just return to go back
      if "exit" in cardID or "back" in cardID:
         return c.BACK

      # No match: probably a read error, or the card is not one of the
      # expected ID cards; the former is assumed.
      return c.ERROR_READING_CARD


   def getDbInfo(self):
      """Prompt for host, table, user and password, applying defaults.

      Empty answers fall back to c.DEFAULT_HOST, c.DEFAULT_TABLE and
      c.DEFAULT_USER. The password has no default and is re-requested
      until it is non-empty.
      """
      self.dbHost = raw_input("Database host: (" + c.DEFAULT_HOST + ") ") or c.DEFAULT_HOST
      self.dbTable = raw_input("Database table: (" + c.DEFAULT_TABLE + ") ") or c.DEFAULT_TABLE
      self.dbUser = raw_input("Database Username: (" + c.DEFAULT_USER + ") ") or c.DEFAULT_USER

      # NOTE(review): the prompt and blank check below were reconstructed
      # from a line that had been mangled (credential scrubbing) in this
      # copy of the file; confirm against the upstream source.
      while True:
         self.dbPass = getpass.getpass("Database Password: ")
         if self.dbPass == "":
            print("Database password cannot be blank.")
         else:
            break


   def showCheckinConfirmation(self, accessID, pointValue):
      """Print the one-line confirmation for a successful check-in."""
      print("%s +%s points" % (accessID, pointValue))


   def showDatabaseError(self, error):
      """Print a database error.

      Shows error.args[1] when the exception carries at least two args,
      otherwise falls back to the exception itself instead of raising
      IndexError.
      """
      args = getattr(error, "args", ())
      detail = args[1] if len(args) > 1 else error
      print("\nWARNING! Database error:\n%s" % (detail,))


   def invalidInput(self):
      """Tell the user the menu option was not recognised."""
      print("Invalid option. Try again.")
예제 #6
0
 def get_author_by_sql(self, typee, ids):
     """Look up authors through a freshly created DB object.

     typee and ids are passed straight to DB.getAuthors and its result is
     returned unchanged (the original comment notes the lookup goes
     through MySQL).
     """
     return DB().getAuthors(typee, ids)
예제 #7
0
    def expertDocsSort(self, expertId, txt, topN):
        """Rank one expert's papers, patents and projects against a text.

        The text and every document (columns 1 and 2 concatenated) are
        turned into vectors with self.t2v.text2v; documents are ordered by
        their Annoy distance to the text vector and the first topN of each
        kind are returned.

        Returns a dict with the keys 'papers', 'patents' and 'projects',
        each a list of dicts whose values are UTF-8 encoded (None values
        are passed through).

        Side effect kept from the original: column 3 of every fetched row
        is overwritten with the distance, or with None when no vector
        could be computed.
        """
        queryVec = self.t2v.text2v(txt, self.cuttor)
        annoy = AnnoyIndex(200)
        db = DB()

        def utf8(value):
            # None passes through untouched; everything else is encoded.
            return value if value is None else value.encode('utf8')

        def rankByDistance(rows):
            # Item 0 is always the query vector; documents take 1, 2, ...
            annoy.add_item(0, queryVec)
            nextItem = 1
            for row in rows:
                row[3] = self.t2v.text2v(row[1] + row[2], self.cuttor)
                if row[3] is not None:
                    annoy.add_item(nextItem, row[3])
                    row[3] = annoy.get_distance(0, nextItem)
                    nextItem += 1
            # Rows without a vector keep None in column 3. A plain sort put
            # them ahead of every real distance under Python 2 (None sorts
            # first), i.e. ranked them as most similar; they now go last.
            return sorted(rows, key=lambda row: (row[3] is None, row[3]))

        def formatTop(rows, columns):
            # columns is a sequence of (output key, row index) pairs.
            formatted = []
            for row in rows:
                if len(formatted) == topN:
                    break
                formatted.append(
                    dict((key, utf8(row[col])) for key, col in columns))
            return formatted

        papers = rankByDistance(db.getPapers(expertId))
        papersFormated = formatTop(papers, (
            ('paperId', 0),
            ('name', 1),
            ('authors', 4),
            ('journalName', 5),
            ('year', 6),
        ))

        # The index is reset between document kinds, as in the original.
        annoy.unload()
        patents = rankByDistance(db.getPatents(expertId))
        patentsFormated = formatTop(patents, (
            ('patentId', 0),
            ('publicationNo', 4),
            ('name', 1),
            ('inventors', 5),
            ('applicant', 6),
            ('year', 7),
        ))

        annoy.unload()
        projects = rankByDistance(db.getProjects(expertId))
        projectsFormated = formatTop(projects, (
            ('projectId', 0),
            ('name', 1),
            ('member', 4),
            ('unit', 5),
            ('year', 6),
            ('type', 7),
        ))

        return {
            'papers': papersFormated,
            'patents': patentsFormated,
            'projects': projectsFormated,
        }
class Recommander(object):
    def __init__(self, vec_file, pap, pat, pro):
        """Load the word-vector model and the three document indexes.

        vec_file is read as a binary word2vec file; pap, pat and pro are
        handed to AnnoyIndexer.load() for the paper, patent and project
        indexes respectively. Also creates the text-to-vector converter,
        the cutter, the DB handle and the feature index.
        """
        # NOTE(review): a commented-out alternative in the original used
        # gensim.models.KeyedVectors.load_word2vec_format; confirm which
        # loader matches the installed gensim version.
        load_vectors = gensim.models.word2vec.Word2Vec.load_word2vec_format
        self.wm = load_vectors(vec_file, binary=True)

        # Each index is attached to the instance first and loaded afterwards.
        for attr, path in (("paper_index", pap),
                           ("patent_index", pat),
                           ("project_index", pro)):
            setattr(self, attr, AnnoyIndexer())
            getattr(self, attr).load(path)

        self.t2v = Convert2Vec(self.wm)
        self.cuttor = FilterCut()
        self.db = DB()
        self.featureIndex = self.buildFeatureIndex()

    def buildFeatureIndex(self):
        paperFeature = open(
            "/testdata400/data/recommender/data0828/feature/paper_feature.txt",
            'r')
        patentFeature = open(
            "/testdata400/data/recommender/data0828/feature/patent_feature.txt",
            'r')
        projectFeature = open(
            "/testdata400/data/recommender/data0828/feature/project_feature.txt",
            'r')
        featureIndex = {}
        featureIndex['paper'] = self.loadFeature(paperFeature)
        featureIndex['patent'] = self.loadFeature(patentFeature)
        featureIndex['project'] = self.loadFeature(projectFeature)
        return featureIndex

    def loadFeature(self, file):
        """Build the feature lookup tables from a tab-separated file.

        file is an open file-like object. Each line is expected to hold at
        least five tab-separated columns: a document id followed by its
        field, type, province and unit.

        Returns a dict with the keys 'field', 'type', 'province' and
        'unit'; each maps a feature value to the list of document ids
        carrying it, in file order.

        The line terminator is stripped before splitting so that it no
        longer ends up inside the last column, and lines with fewer than
        five columns (for example a trailing blank line) are skipped
        instead of raising IndexError.
        """
        columns = (('field', 1), ('type', 2), ('province', 3), ('unit', 4))
        index = dict((name, {}) for name, _ in columns)
        for line in file.readlines():
            feature = line.rstrip('\r\n').split('\t')
            if len(feature) < 5:
                continue
            docId = feature[0]
            for name, col in columns:
                index[name].setdefault(feature[col], []).append(docId)
        return index

    # 过滤论文,项目,专利
    def filter(self, typee, topDocs, filterParams, topN):
        topDocIds = [i for i, j in topDocs]
        if not (filterParams[0] == '' or filterParams[0] == '-1' or typee
                == 'project'):  # field, 项目没有type,不用过滤,参数为空字符串或者-1表示不过滤
            if filterParams[0] not in self.featureIndex[typee]['field']:
                topDocIds = []
            topDocIds = list(
                set(topDocIds).intersection(
                    self.featureIndex[typee]['field'][filterParams[0]]))
        if not (filterParams[1] == '' or filterParams[1] == '-1'):  # type
            if filterParams[1] not in self.featureIndex[typee]['type']:
                topDocIds = []
            topDocIds = list(
                set(topDocIds).intersection(
                    self.featureIndex[typee]['type'][filterParams[1]]))
        if not (filterParams[2] == '' or filterParams[2] == '-1'):  # province
            if filterParams[2] not in self.featureIndex[typee]['province']:
                topDocIds = []
            topDocIds = list(
                set(topDocIds).intersection(
                    self.featureIndex[typee]['province'][filterParams[2]]))
        if not (filterParams[3] == '' or filterParams[3] == '-1'):  # unit
            if filterParams[3] not in self.featureIndex[typee]['unit']:
                topDocIds = []
            topDocIds = list(
                set(topDocIds).intersection(
                    self.featureIndex[typee]['unit'][filterParams[3]]))
        result = []
        for i in topDocs:
            if i[0] in topDocIds:
                result.append(i)
            if len(result) == topN:
                break
        return result

    # 不过滤地区,且返回全部满足的文档,而不仅仅是topn个文档
    # def filterForExpert(self, typee, topDocs, filterParams):
    #     topDocIds = [i for i,j in topDocs]
    #     if not (filterParams[0] == '' or filterParams[
    #         0] == '-1' or typee == 'project'):  # field, 项目没有type,不用过滤,参数为空字符串或者-1表示不过滤
    #         if filterParams[0] not in self.featureIndex[typee]['field']:
    #             topDocIds = []
    #         topDocIds = list(set(topDocIds).intersection(self.featureIndex[typee]['field'][filterParams[0]]))
    #     if not (filterParams[1] == '' or filterParams[1] == '-1'):  # type
    #         if filterParams[1] not in self.featureIndex[typee]['type']:
    #             topDocIds = []
    #         topDocIds = list(set(topDocIds).intersection(self.featureIndex[typee]['type'][filterParams[1]]))
    #     if not (filterParams[3] == '' or filterParams[3] == '-1'):  # unit
    #         if filterParams[3] not in self.featureIndex[typee]['unit']:
    #             topDocIds = []
    #         topDocIds = list(set(topDocIds).intersection(self.featureIndex[typee]['unit'][filterParams[3]]))
    #     result = []
    #
    #     topDocsMap = {}
    #     for i in range(len(topDocs)):
    #         topDocsMap[topDocs[i][0]]=topDocs[i][1]
    #     for id in topDocIds:
    #         listTemp = [id,topDocsMap[id]]
    #         result.append(listTemp)
    #     return result

    def most_similar_paper(self, text, topn=10):
        vec = self.t2v.text2v(text, self.cuttor)
        return self.paper_index.most_similar(vec, topn)

    def most_similar_patent(self, text, topn=10):
        vec = self.t2v.text2v(text, self.cuttor)
        return self.patent_index.most_similar(vec, topn)

    def most_similar_project(self, text, topn=10):
        vec = self.t2v.text2v(text, self.cuttor)
        return self.project_index.most_similar(vec, topn)

    def getSimExpertsIds(self, topDocs):
        expertInfoOut = {}
        expertMap = {}
        authorSeqWeiht = [1.0, 0.85, 0.7, 0.5]
        for typee in topDocs:
            order = {}
            order[typee] = {}
            k = 0
            for i, j in topDocs[typee]:
                order[typee][i] = k
                k = k + 1
            ids = [i for i, j in topDocs[typee]]
            docExpertIds = self.db.getAuthors(typee, ids)
            for id in docExpertIds:
                if not self.db.idInDB(typee, id):
                    print "docId:" + id + "is not in db"
                    continue
                expertIds = docExpertIds[id]
                qs = 1.0
                sim = qs
                for i, j in topDocs[typee]:
                    if i == id:
                        sim = j * sim
                        break
                for i in range(len(expertIds)):
                    if i >= 4:  # 一个成果考虑4个作者
                        break
                    if expertIds[i] not in expertInfoOut:
                        expertInfoOut[expertIds[i]] = []
                    expertInfoOut[expertIds[i]].append([
                        typee + str(order[typee][id]), sim * authorSeqWeiht[i],
                        i
                    ])
                    if expertIds[i] not in expertMap:
                        expertMap[expertIds[i]] = []
                    expertMap[expertIds[i]].append(sim * authorSeqWeiht[i])
        return expertMap, expertInfoOut

    # 从成果提取专家,有些专家在不过滤省份时排在前,但过滤省份后排在后,为避免此情况,先不过滤成果的地区,
    # 从这些不过滤地区的成果中提取专家,再按地区过滤专家,若不足topN,再在过滤地区的成果中找剩余的专家
    #
    # 这个函数需要重构,但是八成需求会改,所以先不重构了
    def most_similar_expert(self, topPapers, topPatents, topProjects,
                            filterParams, expertTopN):
        """Score experts from their most similar documents.

        The three document lists are narrowed with self.filter (at most 50
        papers, 50 patents and 15 projects), the authors are collected
        with self.getSimExpertsIds, and every expert is scored as the best
        similarity plus `coe` times each of the next best ones, using at
        most `len` similarities in total. `len` and `coe` are read from
        the [global] section of config.ini in the working directory on
        every call.

        Returns the expertTopN best (expert id, score) pairs, best first.
        The per-expert document details are also written to a log file
        through self.printOut.

        The config file is now closed after reading; the handle used to be
        left open.
        """
        config = ConfigParser.ConfigParser()
        with open("config.ini", 'r') as configFile:
            config.readfp(configFile)
        # How many of an expert's documents take part in the score.
        maxDocs = int(config.get('global', 'len'))
        # Weight applied to every similarity after the best one.
        coefficient = float(config.get('global', 'coe'))

        topDocs = {}
        topDocs['paper'] = self.filter('paper', topPapers, filterParams, 50)
        topDocs['patent'] = self.filter('patent', topPatents, filterParams, 50)
        topDocs['project'] = self.filter('project', topProjects, filterParams,
                                         15)

        # expertMap: expert id -> list of document similarities.
        expertMap, expertInfoOut = self.getSimExpertsIds(topDocs)

        expertScoreMap = {}
        for expert in expertMap:
            sims = expertMap[expert]
            sims.sort(reverse=True)
            score = sims[0]
            # max() keeps the slice empty when maxDocs is 1 or smaller.
            for extra in sims[1:max(maxDocs, 1)]:
                score = score + coefficient * extra
            expertScoreMap[expert] = score

        result = sorted(expertScoreMap.items(),
                        key=lambda item: item[1],
                        reverse=True)[0:expertTopN]

        out = [{expert: expertInfoOut[expert]}
               for expert, _ in result if expert in expertInfoOut]
        self.printOut(out, maxDocs)
        return result

    def printOut(self, out, l):
        """Write the per-expert document details to a timestamped log file.

        out is a list of single-entry dicts mapping an expert id to a list
        of [doc label, similarity, author position] entries. Each list is
        replaced in place by its l highest-similarity entries, and four
        lines are written per expert: the id, the doc labels, the
        similarities and the author positions, followed by a blank line.

        The file is created as log/<YYYY-MM-DD HH-MM-SS>.txt relative to
        the working directory (the log directory must already exist) and
        its name is printed to stdout. The file is now closed even when a
        write fails.
        """
        name = str('log/' + time.strftime("%Y-%m-%d %H-%M-%S" +
                                          ".txt", time.localtime()))
        print(name)
        with open(name, 'w') as output:
            # Keep only the l best documents of every expert.
            for expert in out:
                for expertId in expert:
                    expert[expertId] = sorted(expert[expertId],
                                              key=lambda doc: doc[1],
                                              reverse=True)[0:l]
            for expert in out:
                for expertId in expert:
                    docs = expert[expertId]
                    output.write(expertId + '\n')
                    output.write(''.join(
                        doc[0] + '                  '
                        for doc in docs) + '\n')
                    output.write(''.join(
                        str(doc[1]) + '             '
                        for doc in docs) + '\n')
                    output.write(''.join(
                        str(doc[2]) + '                            '
                        for doc in docs) + '\n')
                    output.write("\n")

    # def most_similar_expert(self, text, topDocs):
    #     expertMap = self.getSimExpertsIds(topDocs)  # 专家id为key,各项成果的相似度list为value
    #     expertScoreMap = {}  # 专家为key,评分为value
    #     for expert in expertMap:
    #         expertMap[expert].sort(reverse=True)
    #         sim = expertMap[expert][0]
    #         for i in range(1, len(expertMap[expert])):
    #             if i >= 4:
    #                 break
    #             sim = sim + 0.04 * expertMap[expert][i]
    #         expertScoreMap[expert] = sim
    #     return sorted(expertScoreMap.items(), key=lambda item: item[1], reverse=True)

    def get_model(self):
        return self.wm

    def get_cuttor(self):
        return self.cuttor