Python DBControllerの例、DBController.DBController Pythonの例

コード例 #1

0

ファイルを表示

ファイル: Main.py プロジェクト: hhldiniz/pooptbank

 def __btn_callback(self, btn):
     """Dispatch a main-window button press to the matching action.

     "Entrar" looks the entered credentials up in the ``users`` collection
     and opens the home view on a match, or the error view otherwise.
     "Sair" stops the GUI.  "Cadastrar" and "Configurar" open the sign-up
     and configuration views.  Any other label is ignored.

     :param btn: label of the button that was pressed.
     """
     if btn == "Entrar":
         # db_controller = DBController(host="ds133856.mlab.com", port=33856, db_name="pooptbank", db_user="******",
         #                              db_pass="******")
         db_controller = DBController(host="localhost", port=27017, db_name="pooptbank", db_user="",
                                      db_pass="")
         username = self.get_app_gui().getEntry("Username")
         password = self.get_app_gui().getEntry("Password")
         # NOTE(review): the password is matched as entered against the stored
         # value, i.e. it appears to be kept in clear text — confirm.
         data = db_controller.select_data_single('users', {'username': username, 'password': password})
         if data is not None and data['username'] == username and data['password'] == password:
             home_view = HomeView(View.get_app_gui(self), "Home", data['username'], str(data['balance']))
             home_view.show("Home")
         else:
             error_view = ErrorView(View.get_app_gui(self), "Erro")
             error_view.show("Erro")
     elif btn == "Sair":
         View.get_app_gui(self).stop()
     elif btn == "Cadastrar":
         signup_view = None
         try:
             signup_view = Signup(View.get_app_gui(self), btn)
             signup_view.show(btn)
         except ItemLookupError:
             if signup_view is None:
                 # The constructor itself raised, so there is no view object
                 # to call show() on (the original dereferenced None here).
                 # NOTE(review): assumes the GUI object is an appJar instance
                 # and that the sub-window titled `btn` already exists, so it
                 # can be shown directly — confirm.
                 View.get_app_gui(self).showSubWindow(btn)
             else:
                 signup_view.show(btn)
     elif btn == "Configurar":
         config_view = Configuration(View.get_app_gui(self), btn)
         config_view.show(btn)

コード例 #2

0

ファイルを表示

    def setUp(self):
        """Create an in-memory mongomock database for each test.

        Three collections ("Administrator", "Member", "Activity") are created
        and seeded with one document each, then handed to ``DBController``,
        which is stored on ``self.db``.
        """
        self.database = mongomock.MongoClient().db

        # Administrator collection with a single admin account.
        admin_doc = dict(_id="393",
                         name="software",
                         email_address="*****@*****.**",
                         password="******")
        self.collection_admin = self.database.create_collection(
            "Administrator")
        self.collection_admin.insert_one(admin_doc)

        # Member collection with a single member account.
        member_doc = dict(_id="123",
                          name="Terry",
                          email_address="*****@*****.**",
                          password="******")
        self.collection_member = self.database.create_collection("Member")
        self.collection_member.insert_one(member_doc)

        # Activity collection with a single activity.
        activity_doc = dict(_id="888",
                            name="meeting",
                            start_time="Apr10",
                            end_time="Apr11",
                            location="case")
        self.collection_activity = self.database.create_collection("Activity")
        self.collection_activity.insert_one(activity_doc)

        self.db = DBController(self.collection_member, self.collection_admin,
                               self.collection_activity)

コード例 #3

0

ファイルを表示

ファイル: DatabaseSeeder.py プロジェクト: phygitalism/phyge

    def __seed_books(cls):
        """Add every book from the serialized resource file to the database.

        When ``phy-books/out/articles_books.json`` does not exist yet it is
        first generated from ``phy-books/phy_books.json`` via ``BooksFetcher``.
        Each stored book gets a freshly generated UUID4 string as its id.
        """
        out_dir = 'phy-books/out'
        resource_path = out_dir + '/articles_books.json'

        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        if not os.path.isfile(resource_path):
            print('Resource books does not exist! Сreation is in progress...')

            # Raw data collected from the MIF web site.
            with open('phy-books/phy_books.json', 'r',
                      encoding='utf8') as raw_file:
                raw_books = json.load(raw_file)
            fetcher = BooksFetcher(raw_books)
            serialized = [item.serialize()
                          for item in fetcher.create_phy_book()]

            # Serialized PhyBook objects.
            with open(resource_path, 'w+', encoding='utf8') as out_file:
                json.dump(serialized, out_file, indent=2)
            print('Resource created')

        with open(resource_path, 'r', encoding='utf8') as data_file:
            books = json.load(data_file)

            for position, record in enumerate(books, start=1):
                phy_book = PhyBook(record)
                print(
                    f'add {position} of the {len(books)} books: {phy_book.title}'
                )
                if phy_book is not None:
                    DBController.add_document(phy_book, str(uuid.uuid4()))

コード例 #4

0

ファイルを表示

ファイル: FeatureGenerator.py プロジェクト: exsonic/BillboardPredictor

 def fillMissingValue(self, featureDict):
     """Replace every ``None`` value in ``featureDict`` and return the dict.

     For a missing feature, the mean of that feature over all feature
     vectors stored for the same song (``featureDict['id']``) is used when
     at least one of them has a value.  Otherwise 'radio', 'streaming' and
     'sales' fall back to the current ``featureDict['rank']`` and the other
     known features fall back to fixed constants.  Unknown features with no
     history are left as ``None``.  The dict is modified in place.
     """
     constantFallback = {
         'MVView': 7,
         'MVSocialInteraction': 4,
         'MTVReviewCount': 10,
         'MTVReviewScore': 0,
         'youtubeCommentCount': 100,
         'youtubeCommentScore': 3,
         'twitterCount': 100,
         'twitterScore': 0,
     }
     rankBackedKeys = ('radio', 'streaming', 'sales')

     db = DBController()
     for key, value in featureDict.iteritems():
         if value is not None:
             continue
         songHistory = db.getAllFeatureListBySong(featureDict['id'])
         observed = [vec[key] for vec in songHistory if vec[key] is not None]
         if observed:
             featureDict[key] = sum(observed) / float(len(observed))
         elif key in rankBackedKeys:
             featureDict[key] = featureDict['rank']
         elif key in constantFallback:
             featureDict[key] = constantFallback[key]
     return featureDict

コード例 #5

0

ファイルを表示

ファイル: Crawler.py プロジェクト: shubhamchugh/APKCrawler

    def __init__(self, is_desktop):
        """
        Constructor.

        is_desktop : True when running in a desktop (GUI) environment,
                     False when running on a server
        """
        # Load the settings from config.ini
        settings = configparser.ConfigParser()
        settings.read('config.ini')
        self.apk_directory = settings.get('Setting', 'APK_DIRECTORY')
        self.is_desktop = is_desktop

        # In server mode, start a virtual display
        if not is_desktop:
            self.display = Display(visible=0, size=(800, 600))
            self.display.start()

        # Start the Chrome driver
        driver_path = settings.get('Setting', 'CHROME_DRIVER_DIRECTORY')
        self.chrome = webdriver.Chrome(driver_path)

        # Keep the list of categories to crawl
        self.category_list = settings.items('PlayStoreURL')

        # Create the DBController that stores and manages the data
        db_path = settings.get('Setting', 'DB_DIRECTORY')
        self.db_connector = DBController(db_path)

        # Create the SQLite table that will hold the metadata
        self.db_connector.create_table()

コード例 #6

0

ファイルを表示

ファイル: apkpure_crawler.py プロジェクト: munhyunsu/ApplicationPerformance

    def __init__(self, is_desktop):
        """
        Constructor.

        is_desktop : True when running in a desktop (GUI) environment,
                     False when running on a server
        """
        # Load the settings from config.ini
        settings = configparser.ConfigParser()
        settings.read('config.ini')
        self.apk_directory = settings.get('Setting', 'APK_DIRECTORY')
        os.makedirs(self.apk_directory, exist_ok=True)
        self.is_desktop = is_desktop

        # In server mode, start a virtual display and run Chrome headless
        browser_options = webdriver.ChromeOptions()
        if not is_desktop:
            self.display = Display(visible=0, size=(1024, 768))
            self.display.start()
            browser_options.add_argument('--headless')

        # Start the Chrome driver
        driver_path = settings.get('Setting', 'CHROME_DRIVER_DIRECTORY')
        self.chrome = webdriver.Chrome(driver_path,
                                       chrome_options=browser_options)
        self.chrome.set_window_size(1024, 768)
        #self.chrome.set_page_load_timeout(30)

        # Keep the list of categories to crawl
        self.category_list = settings.items('PlayStoreURL')

        # Create the DBController that stores and manages the data
        db_path = settings.get('Setting', 'DB_DIRECTORY')
        self.db_connector = DBController(db_path)

        # Create the SQLite table that will hold the metadata
        self.db_connector.create_table()

コード例 #7

0

ファイルを表示

ファイル: RegressionModel.py プロジェクト: exsonic/BillboardPredictor

 def computeBaseLine(self, baselineType=0):
     """Print the average score of the "same as last week" baseline.

     For every week from 2013-03-23 to 2013-04-20 the prediction for a song
     is its popularity score of the previous week (0 when it was not in the
     top 50).  The target is its score of the current week, or last week's
     score when the current rank is missing.  The weekly scores are averaged
     over the evaluated weeks and printed.

     :param baselineType: 0 uses ``self.getRankEvalationScore``, 1 uses
         ``metrics.r2_score``, anything else ``metrics.mean_squared_error``.
     """
     iterWeek, endWeek = datetime(2013, 3, 23), datetime(2013, 4, 20)
     db = DBController()
     fg = FeatureGenerator()
     baselineScore = 0
     weekCount = 0
     while iterWeek <= endWeek:
         lastWeek = iterWeek - timedelta(weeks=1)
         featureList = db.getFeatureListByWeek(iterWeek)
         y_pred, y_test = [], []
         for featureVector in featureList:
             songId = featureVector["id"]
             lastWeekRank = db.getTop50Rank(lastWeek, songId)
             if lastWeekRank is None:
                 lastWeekScore = 0
             else:
                 lastWeekScore = fg.rankToPopScore(lastWeekRank)
             currentWeekRank = featureVector["rank"]
             currentWeekScore = fg.rankToPopScore(currentWeekRank) if currentWeekRank is not None else lastWeekScore
             y_pred.append(lastWeekScore)
             y_test.append(currentWeekScore)
         y_pred, y_test = self.getRankArray(numpy.asarray(y_pred)), self.getRankArray(numpy.asarray(y_test))
         if baselineType == 0:
             baselineScore += self.getRankEvalationScore(y_pred, y_test)
         elif baselineType == 1:
             # r2_score is not symmetric: the ground truth goes first.
             # NOTE(review): assumes `metrics` is sklearn.metrics — confirm.
             baselineScore += metrics.r2_score(y_test, y_pred)
         else:
             baselineScore += metrics.mean_squared_error(y_test, y_pred)
         weekCount += 1
         iterWeek += timedelta(weeks=1)
     # Average over the weeks actually evaluated instead of a hard-coded 5.
     baselineScore = baselineScore / weekCount
     print(baselineScore)
コード例 #8

0

ファイルを表示

ファイル: MusicReviewsExtractor.py プロジェクト: exsonic/BillboardPredictor

 def extractReviewsToBD(self, songList):
     """Extract the MTV review of every song and store it in the database.

     Songs are processed best-effort: an error for one song is printed and
     the loop moves on to the next one.

     :param songList: iterable of song dicts; each needs an ``"id"`` key.
     """
     db = DBController()
     for i, song in enumerate(songList):
         # Progress indicator; print(x) behaves the same on Python 2 and 3.
         print(i)
         try:
             review = self.extractReviewFromMTV(song)
             db.insertMTVReviewToDB(song["id"], review)
         except Exception as e:
             print(e)

コード例 #9

0

ファイルを表示

ファイル: IMVDBDataExtractor.py プロジェクト: exsonic/BillboardPredictor

 def extractDataToDB(self, songList):
     """Extract the IMVDB data of every song and store it in the database.

     Songs are processed best-effort: an error for one song is printed and
     the loop moves on to the next one.

     :param songList: iterable of song dicts with ``'title'``, ``'artist'``
         and ``'id'`` keys.
     """
     db = DBController()
     for song in songList:
         try:
             URL = self.getURL(song['title'], song['artist'])
             viewStatDataList, socialInteractionDataList, detailStatDataDict = self.extractDataFromIMVDB(URL)
             db.insertIMVDBDataToDB(song['id'], viewStatDataList, socialInteractionDataList, detailStatDataDict)
         except Exception as e:
             # print(x) behaves the same on Python 2 and 3.
             print(e)

コード例 #10

0

ファイルを表示

ファイル: FileUtils.py プロジェクト: exsonic/CorpusAnalysis

def loadBrokerageToDB(filePath):
	"""Save every brokerage listed in a CSV file to the database.

	The first row is treated as a header and skipped.  In each remaining
	row, column 0 is the name, column 1 the id and column 2 the code; all
	cells are stripped of surrounding whitespace.
	"""
	with open(filePath, 'rU') as csvFile:
		db = DBController()
		rows = csv.reader(csvFile)
		# Drop the header row (if the file has any rows at all).
		next(rows, None)
		for row in rows:
			cells = [cell.strip() for cell in row]
			db.saveBrokerage({'_id' : cells[1], 'name' : cells[0], 'code' : cells[2]})

コード例 #11

0

ファイルを表示

ファイル: SentenceClusterer.py プロジェクト: exsonic/CorpusAnalysis

class SentenceClusterer(object):
    """Cluster stored sentences with K-means over unigram count vectors."""

    def __init__(self):
        self.db = DBController()
        self.clusterer = KMeans(n_clusters=TOTAL_CLUSTER)

    def train(self, X):
        """Fit the clusterer on the feature matrix ``X``."""
        self.clusterer.fit(X)

    def predict(self, X):
        """Return the cluster label predicted for every row of ``X``."""
        return self.clusterer.predict(X)

    def updateSentenceCluster(self, clusterList, sentenceIdList):
        """Store the cluster of every sentence; the two lists are parallel."""
        for sentenceId, cluster in zip(sentenceIdList, clusterList):
            self.db.updateSentenceCluster(sentenceId, int(cluster))

    def clusterSentenceInBatch(self, startId=0, limit=5000):
        """Cluster the sentences in the id range in BATCH_SIZE-wide steps.

        Every full batch inside ``[startId, startId + limit]`` is processed.
        The original loop used ``<`` and therefore skipped the last full
        batch (and did nothing when ``limit == BATCH_SIZE``).  A trailing
        partial batch is still not processed.
        NOTE(review): each batch re-fits the clusterer, so labels from
        different batches are presumably not comparable — confirm intent.
        """
        endId, lastId = startId + BATCH_SIZE, startId + limit
        while endId <= lastId:
            sentences = self.db.getSentenceInRange(startId, endId)
            self.clusterSentence(sentences)
            startId += BATCH_SIZE
            endId += BATCH_SIZE

    def clusterSentence(self, sentences):
        """Fit on the given sentences, then store their predicted clusters."""
        sentenceMatrix, sentenceIdList = self.getSentenceMatrixAndIdList(sentences)
        self.train(sentenceMatrix)
        clusterList = self.predict(sentenceMatrix)
        self.updateSentenceCluster(clusterList, sentenceIdList)

    def getSentenceMatrixAndIdList(self, sentences):
        """Return ``(matrix, idList)`` for the sentences.

        ``matrix`` is a numpy array with one count vector per sentence and
        ``idList`` holds the matching ``'_id'`` values in the same order.
        """
        table = self.db.getUnigramTable()
        matrix, idList = [], []
        for i, sentence in enumerate(sentences):
            # Progress indicator; print(x) behaves the same on Python 2 and 3.
            print(i)
            wordList = getProcessedWordList(sentence['content'])
            matrix.append(self.getSentenceVector(table, wordList))
            idList.append(sentence['_id'])
        return numpy.array(matrix), idList

    def getSentenceVector(self, table, wordList):
        """Return the word-count vector of ``wordList``.

        ``table`` maps each word to its index in the vector.  A word that is
        missing from ``table`` raises ``KeyError``.
        """
        vector = [0] * len(table)
        for word in wordList:
            vector[table[word]] += 1
        return vector
    

# if __name__ == '__main__':
#     sc = SentenceClusterer()
#     sc.clusterSentenceInBatch(30000, 1000)

コード例 #12

0

ファイルを表示

ファイル: YoutubeCommentsExtractor.py プロジェクト: exsonic/BillboardPredictor

 def extractYoutubeCommentsToDB(self, songList):
     """Fetch the YouTube comments of every song and store them.

     The video is looked up by "<title> <artist>".  Songs are processed
     best-effort: an error for one song is printed and the loop moves on.

     :param songList: iterable of song dicts with ``'title'``, ``'artist'``
         and ``'id'`` keys.
     """
     db = DBController()
     for song in songList:
         try:
             searchVideoName = song['title'] + ' ' + song['artist']
             videoID = self.getVideoID(searchVideoName)
             comments = self.getComments(videoID)
             db.insertCommentToDB(song['id'], comments)
         except Exception as e:
             # print(x) behaves the same on Python 2 and 3.
             print(e)

コード例 #13

0

ファイルを表示

ファイル: RegressionModel.py プロジェクト: exsonic/BillboardPredictor

 def outputTopX(self, songIdList, rankList, x=10):
     """Return the songs ranked 1..x, in rank order.

     ``songIdList`` and ``rankList`` are parallel: the song at position i
     has rank ``rankList[i]``.  Ranks that are absent, or whose song cannot
     be loaded, are skipped, so the result may hold fewer than ``x`` songs.

     :raises Exception: when ``x`` is greater than 40.
     """
     if x > 40:
         # The check allows x == 40, so the message says "at most".
         raise Exception("x must be at most 40")
     db = DBController()
     songList = []
     for rank in range(1, x + 1):
         try:
             songId = songIdList[rankList.index(rank)]
             songList.append(db.getSongById(songId))
         except Exception:
             # Best effort, but no longer swallows KeyboardInterrupt/SystemExit.
             continue
     return songList

コード例 #14

0

ファイルを表示

ファイル: ArticleFetcher.py プロジェクト: phygitalism/phyge

    async def download_article(self, url: str, sem) -> PhyWebArticle:
        """Download, parse and store the article at ``url``.

        The work runs while holding ``sem``.  An article is stored (under a
        fresh UUID4 string) only when HTML was loaded and parsing produced
        at least one normalized word.

        :param url: address of the article.
        :param sem: async context manager that limits concurrency.
        :return: the stored article, or ``None`` when nothing was stored.
            The original fell off the end and always returned ``None``
            despite the annotation.
        """
        async with sem:
            article = None
            article_html = await self.load_html(url)
            if len(article_html) > 0:
                article = self.parse_html(url, article_html)
                if len(article.normalized_words) == 0:
                    print(f'url {url} PARSE ERR')
                    article = None

            if article is not None:
                DBController.add_document(article, str(uuid.uuid4()))

            return article

コード例 #15

0

ファイルを表示

ファイル: TrainingSample.py プロジェクト: phygitalism/phyge

 def __getitem__(self, key):
     """Return the article(s) at ``key`` as ``BaseArticle`` objects.

     A slice yields a list of articles.  An integer (including numpy
     integer types) yields a single article, with negative values counted
     from the end.

     :raises IndexError: when an integer key is out of range.
     :raises TypeError: when ``key`` is neither a slice nor an integer.
     """
     if isinstance(key, slice):
         positions = range(*key.indices(len(self)))
         return [
             BaseArticle(DBController.get_article(self.articles_id[pos]))
             for pos in positions
         ]

     if not np.issubdtype(type(key), np.integer):
         raise TypeError("Invalid argument type.")

     # Map a negative index onto its position from the end.
     index = key + len(self) if key < 0 else key
     if not 0 <= index < len(self):
         raise IndexError("The index {} is out of range.".format(index))
     return BaseArticle(DBController.get_article(self.articles_id[index]))

コード例 #16

0

ファイルを表示

ファイル: Crawler.py プロジェクト: himanshudas/APKCrawler

    def __init__(self, is_desktop):
        """Read config.ini, start the browser and prepare the database.

        is_desktop : when false, a virtual display is started before the
                     Chrome driver is launched
        """
        settings = configparser.ConfigParser()
        settings.read('config.ini')
        self.apk_directory = settings.get('Setting', 'APK_DIRECTORY')
        self.is_desktop = is_desktop
        if not is_desktop:
            self.display = Display(visible=0, size=(800, 600))
            self.display.start()

        driver_path = settings.get('Setting', 'CHROME_DRIVER_DIRECTORY')
        self.chrome = webdriver.Chrome(driver_path)
        self.category_list = settings.items('PlayStoreURL')
        db_path = settings.get('Setting', 'DB_DIRECTORY')
        self.db_connector = DBController(db_path)
        self.db_connector.create_table()

コード例 #17

0

ファイルを表示

ファイル: Main.py プロジェクト: mayercf4473/OOTPIntel

def main():
    """Command-line entry point of the stats importer.

    Parses ``--schema``, ``--config`` and ``--playerFile``, connects to the
    database, runs the ``DBController`` init check and imports the player
    file.
    """
    # print(x) behaves the same on Python 2 and 3.
    print("Welcome to stats importer!")
    parser = argparse.ArgumentParser()
    parser.add_argument("--schema", help="Schema name", default="ootp_players")
    parser.add_argument("--config", help="Json config file")
    parser.add_argument("--playerFile", help="Player file to import")
    args = parser.parse_args()
    BaseModel.TheDatabase.init(args.schema, user='******', password='******')
    BaseModel.TheDatabase.connect()
    dbController = DBController()
    dbController.checkInit()
    importer = PlayersImporter(LeagueConsts(args.config))
    #importer = StatsImporter(2014)
    importer.doImport(args.playerFile)

コード例 #18

0

ファイルを表示

ファイル: SalesChartExtractor.py プロジェクト: exsonic/BillboardPredictor

 def extractSalesRankToDB(self, beginDate=None, endDate=None):
     """Download and store the weekly sales chart for a date range.

     Both dates are snapped with ``dateToSaturday`` and the range is walked
     in 7-day steps.  Weeks already present in the database are skipped.

     :param beginDate: first date of the range; defaults to today.
     :param endDate: last date of the range; defaults to today.
     :raises Exception: when ``beginDate`` is before 2007-01-01 or
         ``endDate`` lies in the future.
     """
     # Resolve the defaults per call.  A `datetime.today()` default in the
     # signature is evaluated once, when the function is defined, and goes
     # stale in a long-running process.
     now = datetime.today()
     if beginDate is None:
         beginDate = now
     if endDate is None:
         endDate = now
     if beginDate < datetime(2007, 1, 1) or endDate > datetime.today():
         raise Exception('Invalid input date!')
     beginDate = dateToSaturday(beginDate)
     endDate = dateToSaturday(endDate)
     # Snapping may land on a future date; step back one week then.
     endDate = endDate - timedelta(days=7) if endDate > datetime.today() else endDate
     db = DBController()
     iterDate = beginDate
     while iterDate <= endDate:
         if not db.checkSalesRankExistInDB(iterDate):
             URL = self.getURL(iterDate)
             chart = self.getSalesChartFromURL(URL)
             db.insertSalesChartToDB(iterDate, chart)
         iterDate = iterDate + timedelta(days=7)

コード例 #19

0

ファイルを表示

ファイル: Exporter.py プロジェクト: exsonic/DialogueAnalysis

	def __init__(self, taskQueue, resultQueue, *args):
		"""Store the task/result queues and extra arguments and open a DB handle.

		The execute function starts out unset (``None``).
		"""
		super(ProcessThread, self).__init__()
		self._taskQueue, self._resultQueue = taskQueue, resultQueue
		self._args = args
		self._executeFunction = None
		self._db = DBController()

コード例 #20

0

ファイルを表示

ファイル: Utils.py プロジェクト: exsonic/PRAnalysis

def loadCompeletedCodingFile(filePath):
	"""Save every row of a completed-coding CSV file to the database.

	The first row is treated as a header and skipped.  Columns 7, 9 and 10
	are converted to int, then the row is zipped with the fixed key list
	and saved.  Rows that cannot be converted or saved are skipped.
	"""
	db = DBController()
	with open(filePath, 'rU') as f:
		reader = csv.reader(f)
		keyList = ['_id', 'OC_ID', 'OUTCOME_ID', 'CAUSE_ID', 'PR_ID', 'NAME', 'OUTCOME', 'FAVORABILITY', 'CAUSE', 'LOCUS_CAUSALITY', 'CONTROLLABILITY']
		for i, line in enumerate(reader):
			if i == 0:
				continue
			try:
				for column in (7, 9, 10):
					line[column] = int(line[column])
				sentenceDict = dict(zip(keyList, line))
				db.saveCompletedSentence(sentenceDict)
			except Exception:
				# Still best effort per row, but no longer swallows
				# KeyboardInterrupt/SystemExit like the bare `except:` did.
				continue

コード例 #21

0

ファイルを表示

def check_db_status():
    """Seed the database when it contains no articles."""
    # Stop at the first article instead of counting every one of them.
    has_articles = any(True for _ in DBController.get_all_articles())
    if not has_articles:
        print('Seeding database...')
        DatabaseSeeder.seed()

コード例 #22

0

ファイルを表示

ファイル: FileUtils.py プロジェクト: exsonic/CorpusAnalysis

def loadAllRTFToDB(folderPath):
	"""Parse every .rtf file below ``folderPath`` and save its articles.

	Each file is converted to plain text and split into articles at lines
	of the form "Document <id>".  Within an article the byline ("by ..."),
	the word-count line ("... words") and the language line ("english")
	locate the headline, date, source name and body.  Files that cannot be
	read are skipped.  Articles whose markers are missing or out of order
	are reported with print() and skipped.
	"""
	db = DBController()
	for dirPath, dirNames, fileNames in os.walk(folderPath):
		for fileName in fileNames:
			if not fileName.endswith('.rtf'):
				continue
			filePath = os.path.join(dirPath, fileName)
			print(filePath)
			try:
				# The original never closed the handle it opened here.
				with open(filePath) as rtfFile:
					doc = Rtf15Reader.read(rtfFile)
					text = PlaintextWriter.write(doc).getvalue()
			except Exception:
				continue
			lines = [line.strip() for line in text.split('\n') if line]

			# Split the file into articles; each one ends at its "Document <id>" line.
			articleLinesDict, articleStartIndex = {}, 0
			for i, line in enumerate(lines):
				if line.startswith('Document ') and len(line.split(' ')) == 2:
					articleId = line.split(' ')[-1]
					articleLinesDict[articleId] = lines[articleStartIndex : i]
					articleStartIndex = i + 1

			# .items() works on both Python 2 and 3.
			for articleId, articleLines in articleLinesDict.items():
				bylineIndex, wordCountIndex, textStartIndex = -1, -1, -1
				for i, line in enumerate(articleLines):
					line = line.lower()
					if line.startswith('by '):
						bylineIndex = i
					elif line.endswith(' words'):
						wordCountIndex = i
					elif line == 'english':
						textStartIndex = i + 2

				if wordCountIndex == -1 or textStartIndex == -1 or wordCountIndex > textStartIndex:
					print(filePath + ', ' + articleId)
				else:
					articleDict = {'_id': articleId,
					               'filePath' : filePath.split('Marshall_RA/')[-1],
					               'headline': ' '.join(articleLines[: wordCountIndex]) if bylineIndex == -1 else ' '.join(articleLines[: bylineIndex]),
					               'byline' : '' if bylineIndex == -1 else articleLines[bylineIndex],
					               'date' : parser.parse(articleLines[wordCountIndex + 1]),
					               'sourceName' : articleLines[wordCountIndex + 2] if articleLines[wordCountIndex + 2].find(' AM') == -1 and articleLines[wordCountIndex + 2].find(' PM') == -1 else articleLines[wordCountIndex + 3],
					               'leadParagraph' : '',
					               'tailParagraph' : '\n'.join(articleLines[textStartIndex:]),
					               'sourceCode' : '', 'industry' : [], 'region' : [], 'newsSubject' : [], 'company' : []}
					db.saveArticle(articleDict)

コード例 #23

0

ファイルを表示

ファイル: FileUtils.py プロジェクト: exsonic/CorpusAnalysis

def loadAllXMLtoDB(inputDir):
	"""Parse every XML file below ``inputDir`` and save its articles.

	Each article gets a ``'filePath'`` relative to the 'Marshall_RA/'
	directory before it is saved.  An error for one file is printed
	together with its directory and file name, and the walk continues.
	"""
	#have folder and p, pa info, insert after get
	db = DBController()
	for dirName, _, fileNames in os.walk(inputDir):
		print(dirName)
		for fileName in fileNames:
			try:
				if not fileName.endswith('xml'):
					continue
				fileAbsPath = getAbsPath(dirName, fileName)
				for articleDict in parseArticleFromXML(fileAbsPath):
					#duplication check
					# if db.isArticleDuplicate(articleDict['tailParagraph']):
					# 	continue
					articleDict['filePath'] = fileAbsPath.split('Marshall_RA/')[1]
					db.saveArticle(articleDict)
			except Exception as e:
				# Same space-separated output as the Python 2 print statement,
				# but valid on both Python 2 and 3.
				print('%s %s %s' % (e, dirName, fileName))

コード例 #24

0

ファイルを表示

ファイル: FeatureGenerator.py プロジェクト: exsonic/BillboardPredictor

 def getFeatureMatrix(self, beginWeek, endWeek=None, mode=0, withSongId=False):
     """Return the feature vectors of all weeks in a range as a numpy matrix.

     Both dates are snapped with ``dateToSaturday`` and the range is walked
     one week at a time.  Feature dicts that ``featureDictToList`` turns
     into ``None`` are left out.

     :param beginWeek: first date of the range.
     :param endWeek: last date of the range; defaults to today.
     :param mode: passed through to ``featureDictToList``.
     :param withSongId: passed through to ``featureDictToList``.
     :raises Exception: when ``beginWeek`` is before 2007-01-01 or
         ``endWeek`` lies in the future.
     """
     # Resolve the default per call.  A `datetime.today()` default in the
     # signature is evaluated once, when the function is defined.
     if endWeek is None:
         endWeek = datetime.today()
     if beginWeek < datetime(2007, 1, 1) or endWeek > datetime.today():
         raise Exception('Invalid input date!')
     beginWeek, endWeek = dateToSaturday(beginWeek), dateToSaturday(endWeek)
     # Snapping may land on a future date; step back one week then.
     endWeek = endWeek - timedelta(days=7) if endWeek > datetime.today() else endWeek
     db = DBController()
     matrix = []
     iterWeek = beginWeek
     while iterWeek <= endWeek:
         for featureDict in db.getFeatureListByWeek(iterWeek):
             featureVector = self.featureDictToList(featureDict, mode, withSongId)
             if featureVector is not None:
                 matrix.append(featureVector)
         iterWeek += timedelta(weeks=1)
     return numpy.matrix(matrix)

コード例 #25

0

ファイルを表示

ファイル: StudentDao.py プロジェクト: hhldiniz/pooptbank

 def save(self):
     """Insert this user into the ``users`` collection.

     The new user is a non-admin with a balance of 0.

     :return: ``True`` when the insert returned a result, ``False`` when it
         returned ``None`` or any error occurred.
     """
     try:
         db_controller = DBController()
         address = BaseUser.get_address(self)
         # NOTE(review): the password is written exactly as returned by
         # get_password(); confirm whether it should be hashed first.
         query = db_controller.insert_data(
             'users', {
                 'username': BaseUser.get_name(self),
                 'password': BaseUser.get_password(self),
                 'address': address.get_street(),
                 'cpf': BaseUser.get_cpf(self),
                 'admin': False,
                 'balance': 0
             })
     except Exception:
         # Still reports failure as False, but no longer swallows
         # KeyboardInterrupt/SystemExit like the bare `except:` did.
         return False
     return query is not None

コード例 #26

0

ファイルを表示

    def load_model(cls, path: str, model_name: str,
                   model_type: str) -> BaseModel:
        """Load a previously trained model of the given type from ``path``.

        Every type except 'd2v' also loads ``<model_name>.dict`` and
        ``<model_name>.mm``; 'ft' additionally loads
        ``<model_name>.mat.npz``.  The training sample is rebuilt from the
        articles whose ``serial_id`` is listed by ``cls.load_articles_id``.

        :param path: directory that holds the model files.
        :param model_name: base name of the model files.
        :param model_type: one of 'lsi', 'lda', 'd2v' or 'ft'.
        :return: the wrapped model, or ``None`` for any other ``model_type``.
        """
        print(f'{model_name}.{model_type} model loading...')

        if model_type != 'd2v':
            dictionary = corpora.Dictionary.load(
                os.path.join(path, f'{model_name}.dict'))
            corpus = corpora.MmCorpus(os.path.join(path, f'{model_name}.mm'))
        if model_type == 'ft':
            similarity_matrix = sparse.load_npz(
                os.path.join(path, f'{model_name}.mat.npz'))

        id_filter = {'serial_id': {'$in': cls.load_articles_id(path)}}
        training_sample = TrainingSample(
            DBController.get_all_articles(id_filter))

        model_path = os.path.join(path, f'{model_name}.{model_type}')
        if model_type == 'lsi':
            raw_model = models.lsimodel.LsiModel.load(model_path)
            model = LsiModel.trained(name=model_name,
                                     model=raw_model,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     training_sample=training_sample)
        elif model_type == 'lda':
            raw_model = models.ldamodel.LdaModel.load(model_path)
            model = LdaModel.trained(name=model_name,
                                     model=raw_model,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     training_sample=training_sample)
        elif model_type == 'd2v':
            raw_model = models.doc2vec.Doc2Vec.load(model_path)
            model = D2vModel.trained(name=model_name,
                                     model=raw_model,
                                     corpus=None,
                                     dictionary=None,
                                     training_sample=training_sample)
        elif model_type == 'ft':
            raw_model = models.FastText.load(model_path)
            model = FastTextModel.trained(
                name=model_name,
                model=raw_model,
                corpus=corpus,
                dictionary=dictionary,
                similarity_matrix=similarity_matrix,
                training_sample=training_sample)
        else:
            # Unknown types yield no model.
            model = None
        print('Loaded')

        return model

コード例 #27

0

ファイルを表示

ファイル: Utils.py プロジェクト: exsonic/PRAnalysis

def loadPRFiles(folderPath):
	"""Save every '*TXT.txt' file below ``folderPath`` as a PR article.

	The file name without its extension is the article id.  Its
	'_'-separated parts provide the code, the year (converted to int) and
	the quarter.  Articles that fail to save are skipped.
	"""
	db = DBController()
	for dirPath, dirNames, fileNames in os.walk(folderPath):
		for fileName in fileNames:
			if not fileName.endswith('TXT.txt'):
				continue
			filePath = os.path.join(dirPath, fileName)
			baseName = fileName.split('.')[0]
			fileNameParts = baseName.split('_')
			articleDict = {'_id':baseName, 'code' : fileNameParts[0], 'year' : int(fileNameParts[1]), 'quarter' : fileNameParts[2]}
			with open(filePath, 'rU') as f:
				articleDict['text'] = ('\n '.join(f.readlines())).decode('utf-8', 'ignore')
			try:
				db.savePRArticle(articleDict)
			except Exception:
				# Still best effort per article, but no longer swallows
				# KeyboardInterrupt/SystemExit like the bare `except:` did.
				pass


#if __name__ == '__main__':
	#loadCompeletedCodingFile('Corpus/completed-coding.csv')
	#loadPRFiles('/Users/exsonic/Dropbox/Marshall_RA/ENRON/SP500_PR_1999_2004')

コード例 #28

0

ファイルを表示

ファイル: DataExporter.py プロジェクト: exsonic/CorpusAnalysis

	def __init__(self, taskQueue, resultQueue, *args):
		"""Store the queues and extra arguments and prepare the export resources.

		Opens a DB handle, loads the citation word list and creates the
		'export/' directory when it does not exist yet.
		"""
		super(DataProcessorThread, self).__init__()
		self._taskQueue, self._resultQueue = taskQueue, resultQueue
		self._args = args
		self._executeFunction = None

		self._db = DBController()
		self._citeWordList = getWordList(WORD_CITE)
		exportDirMissing = not os.path.exists('export/')
		if exportDirMissing:
			os.makedirs('export/')

コード例 #29

0

ファイルを表示

class PyPass:
    """Password-manager facade over ``DBController`` and ``SecurityManager``.

    Entry ids are stored hashed; user names and passwords are stored
    ciphered.  ``auth_user`` must succeed before any method that ciphers or
    deciphers fields, because it creates the security manager.
    """

    def __init__(self):
        self.__dbController = DBController('localhost', 27017)
        # Created by auth_user(); the cipher/decipher methods need it.
        self.__securityManager = None
        self.__username = None

    def set_username(self, username):
        """Remember the hashed user name and pass it to the DB controller."""
        hashedUsername = SecurityManager.get_hash(username)
        self.__username = hashedUsername
        self.__dbController.set_username(hashedUsername)

    def store_entry(self, id, ciphered_user, ciphered_pass):
        """Store an entry; the arguments are written as given."""
        new_entry = Entry(id, ciphered_user, ciphered_pass)
        self.__dbController.storeEntry(new_entry)

    def get_entry(self, identifier):
        """Return the deciphered entry stored under the hash of ``identifier``.

        :return: dict with 'identifier', 'username' and 'password' keys.
        """
        entry = self.__dbController.getEntry(SecurityManager.get_hash(identifier))
        username = self.__securityManager.decipher_field(entry.username)
        password = self.__securityManager.decipher_field(entry.password)

        return {'identifier': identifier, 'username': username, 'password': password}

    def create_entry(self, id, username, password):
        """Cipher the credentials and store them under the hash of ``id``."""
        ciphered_user = self.__securityManager.cipher_field(username)
        ciphered_password = self.__securityManager.cipher_field(password)
        hashed_id = SecurityManager.get_hash(id)
        self.store_entry(hashed_id, ciphered_user, ciphered_password)

    def generate_new_entry(self, id, username):
        """Store an entry with a generated password under the hash of ``id``."""
        ciphered_user = self.__securityManager.cipher_field(username)
        # NOTE(review): get_entry() deciphers the stored password; confirm
        # that generate_password() already returns a ciphered value.
        ciphered_pass = self.__securityManager.generate_password()
        # Hash the id like create_entry() does.  get_entry() looks entries up
        # by hash, so an id stored ciphered could never be found again.
        hashed_id = SecurityManager.get_hash(id)
        self.store_entry(hashed_id, ciphered_user, ciphered_pass)

    def create_account(self, username, masterPass):
        """Create a user from the hashed name and the twice-hashed master password."""
        hashedUsername = SecurityManager.get_hash(username)
        hashedMasterPass = SecurityManager.get_hash(SecurityManager.get_hash(masterPass))
        return self.__dbController.createUser(hashedUsername, hashedMasterPass)

    def auth_user(self, username, masterPass):
        """Authenticate and, on success, set up the security manager.

        :return: ``True`` when the credentials were accepted, else ``False``.
        """
        hashedUsername = SecurityManager.get_hash(username)
        hashedMasterPass = SecurityManager.get_hash(SecurityManager.get_hash(masterPass))
        if not self.__dbController.authUser(hashedUsername, hashedMasterPass):
            return False
        self.__dbController.set_username(hashedUsername)
        self.__securityManager = SecurityManager(masterPass)
        return True

コード例 #30

0

ファイルを表示

ファイル: MysqlConnectionPool.py プロジェクト: ZhiguoRen/Python_Utils

 def initialize_pool(self):
     """Create the connection queue and, if configured, fill it up front.

     The queue is bounded by ``self.max_pool_size``.  When
     ``self.conn_at_start`` is true, that many ``DBController`` connections
     are created from ``self.conf`` and queued, and
     ``self.current_conn_size`` is incremented for each of them.
     """
     # TODO: a second queue is needed to track the connections currently in use
     self.pool = Queue.Queue(maxsize=self.max_pool_size)
     if not self.conn_at_start:
         return
     for _ in range(self.max_pool_size):
         connection = DBController(host=self.conf["host"],
                                   db_name=self.conf["db_name"],
                                   db_user_name=self.conf["db_user_name"],
                                   psd=self.conf["db_psw"],
                                   port=self.conf["port"])
         self.pool.put_nowait(connection)
         self.current_conn_size += 1

コード例 #31

0

ファイルを表示

ファイル: SignifierParser.py プロジェクト: exsonic/CorpusAnalysis

	def __init__(self):
		"""Open a DB handle and load the word lists, engagers, companies and regex tables."""
		self.db = DBController()

		# One word list per attribute, loaded in this order.
		wordListSources = (
			('pfmWord', WORD_PFM),
			('posWord', WORD_POS),
			('negWord', WORD_NEG),
			('exWord', ATRB_EX),
			('inWord', ATRB_IN),
			('citeWord', WORD_CITE),
		)
		for attributeName, listKey in wordListSources:
			setattr(self, attributeName, getWordList(listKey))

		self.engagerList = list(self.db.getAllEngager())
		self.companyList = list(self.db.getAllCompany())

		patternDicts = self.getRegexPatternDictForEngagerAndCompany()
		self.engagerRegexPatternDict, self.companyRegexPatternDict = patternDicts

コード例 #32

0

ファイルを表示

class Menu:
    """Bot menu that builds and sends the daily timetable message.

    Attributes are set in ``__init__``: ``state`` (starts at 0), ``bot`` (the
    object used to send messages), ``db`` (a DBController) and
    ``mainTemplate`` (starts empty).
    """

    def __init__(self, bot):
        self.state = 0
        self.bot = bot
        self.db = DBController()
        self.mainTemplate = ""

    def initialMessage(self):
        """Build today's half-hourly timetable and send it to the main user.

        Today's reminders are queried to choose the first hour shown: the
        timetable starts at 05:00, or at the hour of the first reminder when
        that is later. One line is produced per half-hour slot up to midnight.
        """
        today = datetime.today()
        # Only integers taken from the local clock are interpolated into the
        # WHERE clause, never user input.
        reminders = self.db.getReminders(where='''DAY(reminder_datetime)={} AND
        MONTH(reminder_datetime)={} AND YEAR(reminder_datetime)={} ORDER BY
         reminder_datetime'''.format(str(today.day), str(today.month),
                                     str(today.year)))
        # Fetched but not rendered yet; kept so the query still runs as before.
        tasks = self.db.getTasks()

        startHour = 5
        # A day without reminders must not crash on reminders[0].
        # NOTE(review): `.hours` is what the original code reads from column 2;
        # confirm the column type actually exposes that attribute.
        if reminders and reminders[0][2].hours > 5:
            startHour = reminders[0][2].hours

        lines = []
        for slot in range((24 - startHour) * 2):
            # Two slots per hour: even slots are HH:00, odd slots are HH:30.
            lines.append('{}:{}0  {}  \n'.format(
                str(startHour + slot // 2),
                '3' if slot % 2 else '0',
                # NOTE(review): the third column presumably holds the
                # reminder/task text for the slot; left blank until the
                # row schema is confirmed.
                '',
            ))
        text = ''.join(lines)

        # NOTE(review): if bot.send_message is a coroutine function (as the
        # awaiting wrapper below suggests) this call only creates the
        # coroutine; confirm how callers expect it to be scheduled.
        self.bot.send_message(config.MAIN_USER_ID,
                              text,
                              disable_notification=False)

    async def send_message(self,
                           user_id: int = config.MAIN_USER_ID,
                           text: str = 'Hi',
                           disable_notification: bool = False) -> bool:
        """Send ``text`` to ``user_id`` through the bot.

        Returns:
            True when sending raised an error, False when it succeeded.
            NOTE(review): this looks inverted, but it is kept because callers
            may already depend on it.
        """
        try:
            await self.bot.send_message(
                user_id, text, disable_notification=disable_notification)
        except Exception:
            # `except Exception` (not a bare except) so task cancellation and
            # interpreter shutdown still propagate.
            return True
        return False

コード例 #33

0

ファイルを表示

 def __init__(self, app, title, user, balance):
     """Build the home window: greeting, menu buttons, balance and sub-views.

     Args:
         app: GUI application object forwarded to SubWindow.
         title: Window title forwarded to SubWindow.
         user: Username shown in the greeting and passed to the transfer view.
         balance: Balance text appended to the balance label (must be a str).
     """
     SubWindow.__init__(self, app, title)
     SubWindow.set_size(self, "500x400")
     SubWindow.add_label(self, "Bem vindo," + user + " .")
     menu_buttons = [
         "Historico", "Trocas", "Transferencias", "Deposito", "Logout"
     ]
     SubWindow.add_btns(self, menu_buttons, self.__btn_callback)
     SubWindow.add_label(self, "Seu saldo é " + balance)
     SubWindow.add_label(self, "Sua ultima transação foi ...")

     # Collect every username so the transfer view can offer them as targets.
     user_records = DBController().select_data("users")
     usernames = [record['username'] for record in user_records]

     self.historicoView = HistoricoView(
         SubWindow.get_app_gui(self), "Historico")
     self.trocasView = TrocasView(SubWindow.get_app_gui(self), "Trocas")
     self.transferenciasView = TransferenciasView(
         SubWindow.get_app_gui(self), "Transferencias", usernames, user)
     self.depositoView = DepositoView(
         SubWindow.get_app_gui(self), "Deposito")

コード例 #34

0

ファイルを表示

 def execute(self):
     """Move ``get_valor()`` from user1's balance to user2's balance.

     Both balances are read from the ``users`` collection, then user1 is
     debited and user2 is credited with two separate updates.

     Returns:
         True when both updates were issued, False when any step raised.
     """
     try:
         db_controller = DBController()
         user1_balance = db_controller.select_data_single(
             "users", {"username": self.__user1})['balance']
         user2_balance = db_controller.select_data_single(
             "users", {"username": self.__user2})['balance']
         # Read the amount once so debit and credit always use the same value.
         amount = self.get_valor()
         # NOTE(review): the two updates are not atomic; if the second one
         # fails, user1 has already been debited.
         db_controller.update_data(
             "users", {'username': self.__user1},
             {'balance': user1_balance - amount})
         db_controller.update_data(
             "users", {'username': self.__user2},
             {'balance': user2_balance + amount})
         return True
     except Exception:
         # Best-effort contract kept (any failure -> False), but
         # KeyboardInterrupt/SystemExit are no longer swallowed.
         return False

コード例 #35

0

ファイルを表示

ファイル: DatabaseSeeder.py プロジェクト: phygitalism/phyge

    def __seed_pdf_articles(cls):
        """Load PDF articles from the JSON resource and store them in the DB.

        Prints a message and returns early when the resource file is missing.
        Each article's text is normalized, wrapped in a PhyPdfArticle (with
        ``lang`` set to ``'en'``) and saved under a freshly generated UUID.
        """
        data_path = 'Resources/pdf_articles.json'

        if not os.path.isfile(data_path):
            print('Resource does not exist!')
            return

        with open(data_path, 'r', encoding='utf8') as data_file:
            data = json.load(data_file)
            total = len(data)

            for position, article_data in enumerate(data, start=1):
                title = article_data['title']
                text = article_data['text']
                fields = {
                    **article_data,
                    'lang': 'en',
                    'normalized_words': TextNormalizer.normalize(text),
                }
                article = PhyPdfArticle(fields)

                print(f'add {position} of the {total} articles: {title}')

                if article is None:
                    continue
                DBController.add_document(article, str(uuid.uuid4()))

コード例 #36

0

ファイルを表示

ファイル: MysqlConnectionPool.py プロジェクト: ZhiguoRen/Python_Utils

    def get_dbc(self):
        """Return a DB controller with a fresh cursor, waiting if none is free.

        When connections are created lazily (``conn_at_start`` is falsy) and
        the pool has not reached ``max_pool_size``, a new DBController is
        built from ``self.conf`` and returned directly. Otherwise the call
        blocks on the pool queue until a controller is available.

        Returns:
            A controller on which ``new_cur()`` has just been called, or None
            when a newly built controller evaluates as falsy.
        """
        if not self.conn_at_start and self.current_conn_size < self.max_pool_size:
            dbc = DBController(host=self.conf["host"],
                               db_name=self.conf["db_name"],
                               db_user_name=self.conf["db_user_name"],
                               psd=self.conf["db_psw"],
                               port=self.conf["port"])
            if not dbc:
                # Parenthesised single-argument print behaves the same on
                # Python 2 and Python 3; the bare statement form is a
                # SyntaxError on Python 3.
                print("cannot generate dbccontroller")
                return None
            self.current_conn_size += 1
            dbc.new_cur()
            return dbc

        # Blocks until another user puts a controller back into the pool.
        dbc = self.pool.get(True)
        dbc.new_cur()
        return dbc

コード例 #37

0

ファイルを表示

ファイル: DataExporter.py プロジェクト: exsonic/CorpusAnalysis

class DataProcessorThread(Thread):
	"""Worker thread that exports corpus data or processes queued articles.

	The method to run is stored in ``_executeFunction`` by the caller and
	invoked from ``run``. Tasks are read from ``taskQueue`` and result rows
	are written to ``resultQueue``; extra positional arguments are kept in
	``_args`` for the processing methods.
	"""
	def __init__(self, taskQueue, resultQueue, *args):
		super(DataProcessorThread, self).__init__()
		self._taskQueue = taskQueue
		self._resultQueue = resultQueue
		self._args = args
		self._executeFunction = None

		self._db = DBController()
		self._citeWordList = getWordList(WORD_CITE)
		#make sure the export directory exists before any export method runs
		if not os.path.exists('export/'):
			os.makedirs('export/')

	@staticmethod
	def _padCodeList(codeList):
		#truncate or pad with '' so the list has exactly ARTICLE_EXPORT_CODE_SIZE entries
		paddedList = list(codeList)[:ARTICLE_EXPORT_CODE_SIZE]
		return paddedList + [''] * (ARTICLE_EXPORT_CODE_SIZE - len(paddedList))

	def exportSentenceAnalysis(self):
		"""Write one CSV row per sentence to export/sentence.csv.

		Rows that raise are reported with print and skipped, so one bad
		sentence does not abort the export.
		"""
		#sentence collection is all the sentence
		#deprecated, need to refactor and apply queue
		with open('export/sentence.csv', 'wb') as f:
			writer = csv.writer(f)
			sentences = self._db.getAllSentence()
			#cache of articles already fetched, keyed by article id
			articleDict = {}
			attributeList = ['id', 'cotic', 'coname', 'filePath', 'accessionNo', 'content', 'coname','ceoname', 'cite',
			                 'co_c', 'ceo_c', 'analyst_c', 'pfm', 'pfm_words', 'pos', 'pos_words', 'neg', 'neg_words',
			                 'internal', 'int_words', 'external', 'ext_words',
			                 'quote_sen', 'analyst']
			writer.writerow(attributeList)
			for i, sentence in enumerate(sentences):
				try:
					print(i)
					if sentence['articleId'] not in articleDict:
						articleDict[sentence['articleId']] = self._db.getArticleById(sentence['articleId'])
					article = articleDict[sentence['articleId']]
					#the company code is a directory name in the file path; skip an 'a' directory level when present
					articlePathPartList = article['filePath'].split('/')
					articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
					articleCompany = self._db.getCompanyByCode(articleCompanyCode)
					articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
					sentenceCompanyList = [self._db.getCompanyById(companyId) for companyId in sentence['company']]
					sentenceCompanyNameString = ','.join([company['shortName'] for company in sentenceCompanyList])
					sentenceEngagerList = [self._db.getEngagerById(engagerId) for engagerId in sentence['engager']]
					CEOList = filter(lambda engager : engager['type'] == ENGAGER_CEO, sentenceEngagerList)
					CEONameString = ','.join([CEO['lastName'] for CEO in CEOList])
					citeWordString = ','.join(sentence['cite'])
					citeCompany, citeCEO, citeAnalyst = int(sentence['citeCompany']), int(sentence['citeCEO']), int(sentence['citeAnalyst'])
					pfmWordString = ','.join(sentence['pfm'])
					posWordString = ','.join(sentence['pos'])
					negWordString = ','.join(sentence['neg'])
					inWordString = ','.join(sentence['in'])
					exWordString = ','.join(sentence['ex'])
					quoteString = getQuotedString(sentence['content'])
					analystSurroundString = getStringSurroundWordInDistance(sentence['content'], 'analyst', ANALYST_SURROUND_DISTANCE)
					#column order must match attributeList above
					lineList = [sentence['_id'], articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], sentence['content'].encode('utf-8'),
					            sentenceCompanyNameString, CEONameString, citeWordString, citeCompany, citeCEO, citeAnalyst,
					            len(sentence['pfm']), pfmWordString, len(sentence['pos']), posWordString, len(sentence['neg']), negWordString,
					            len(sentence['in']), inWordString, len(sentence['ex']), exWordString,
					            quoteString, analystSurroundString]

					writer.writerow(lineList)
				except Exception as e:
					print(e)

	def exportArticleAnalysis(self):
		"""Write one CSV row per article to export/article.csv.

		Articles missing 'company' or 'newsSubject' get those fields filled
		in and are saved back to the DB. Every row carries exactly
		ARTICLE_EXPORT_CODE_SIZE company columns and the same number of
		subject columns so that it lines up with the header.
		"""
		#deprecated
		with open('export/article.csv', 'wb') as f:
			writer = csv.writer(f)
			articleList = list(self._db.getAllArticle())
			attributeList = ['cotic', 'coname', 'filePath', 'accessNo', 'date', 'source', 'byline',
			                 'coname1', 'coname2', 'coname3', 'coname4', 'coname5',
			                 'subjectCode1', 'subjectCode2', 'subjectCode3', 'subjectCode4', 'subjectCode5']
			writer.writerow(attributeList)
			for i, article in enumerate(articleList):
				try:
					print(i)
					articlePathPartList = article['filePath'].split('/')
					articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
					articleCompany = self._db.getCompanyByCode(articleCompanyCode)
					articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']

					#fill in missing fields, then always pad: a short list would shift the later columns
					if 'company' not in article:
						article['company'] = [articleCompanyCode]
					companyCodeList = self._padCodeList(article['company'])

					if 'newsSubject' not in article:
						article['newsSubject'] = []
					subjectCodeList = self._padCodeList(article['newsSubject'])

					self._db.saveArticle(article)

					lineList = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], article['date'], article['sourceName'], article['byline']] + companyCodeList + subjectCodeList
					writer.writerow(lineList)
				except Exception as e:
					print(e)

	def processKeywordSearch(self):
		"""Consume articles from the task queue and emit keyword-match rows.

		The search string is the first extra constructor argument. For each
		article the matching sentences (or, when no sentence matches, the
		matching paragraphs) are joined with tabs and put on the result
		queue. The loop ends when END_OF_QUEUE is received.
		"""
		#NOTE(review): unlike processCitationBlock this loop never calls task_done(); confirm no caller joins the task queue
		searchString = self._args[0]
		while True:
			article = self._taskQueue.get()
			if article == END_OF_QUEUE:
				break
			else:
				articlePathPartList = article['filePath'].split('/')
				articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
				articleCompany = self._db.getCompanyByCode(articleCompanyCode)
				articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
				articleSentenceList = []

				#here, use '|' to combine regex is OK, because sentence is short, will not reduce the performance that much.
				#But in DB search, use iterative way.
				pattern = getPatternByKeywordSearchString(searchString)

				#on sentence level first, if can't find, go to paragraph level.
				for paragraph in [article['headline'], article['byline'], article['leadParagraph'], article['tailParagraph']]:
					sentenceList = sent_tokenize(paragraph)
					for sentence in sentenceList:
						if re.search(pattern, sentence) is not None:
							articleSentenceList.append(sentence.encode('utf-8').strip())
				if not articleSentenceList:
					#search on paragraph level
					for paragraph in [article['headline'], article['byline'], article['leadParagraph'], article['tailParagraph']]:
						if re.search(pattern, paragraph) is not None:
							articleSentenceList.append(paragraph.encode('utf-8').strip())
				lineList = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], article['date'], article['sourceName'].strip(), article['byline'].strip(), article['headline'].strip(), '\t'.join(articleSentenceList)]
				self._resultQueue.put(lineList)

	def processCitationBlock(self):
		"""Consume article batches and emit one row per quoting, citing sentence.

		A sentence is exported when it contains a quoted part longer than
		five words together with at least one cite word. Each row is then
		completed with the NER results for that sentence and put on the
		result queue. The loop ends when END_OF_QUEUE is received.
		"""
		#because list is too long, we need to separate name in to chunk
		brokerNameList = list(self._db.getAllBrokerageEffectiveNameList())
		brokerageNamePatternList = []
		for i in range(0, len(brokerNameList), 500):
			brokerageNamePatternList.append(re.compile(r'|'.join([r'\b' + name + r'\b' for name in brokerNameList[i : i + 500]]), re.IGNORECASE))

		quotePattern = re.compile(r'\"[^\"]+\"')
		citeWordPatternStringList = [(r'\b' + citeWord + r'\b') for citeWord in self._citeWordList]

		companyCEODict = self._db.getAllCompanyCEODict()
		engagerNamePattern = re.compile(r'|'.join(['CEO', 'analyst', 'executive']), re.IGNORECASE)
		citeWordPattern = re.compile(r'|'.join(citeWordPatternStringList), re.IGNORECASE)

		wordMatchPatternList = [getWordRegexPattern(WORD_CAUSE_IN), getWordRegexPattern(WORD_CAUSE_EX), getWordRegexPattern(WORD_CONTROL_LOW), getWordRegexPattern(WORD_CONTROL_HIGH), getWordRegexPattern(MCD_POS), getWordRegexPattern(MCD_NEG), getWordRegexPattern(MCD_UNCERTAIN)]
		filterWordDict = getWordDict(WORD_FILTER)
		while True:
			#process in batch
			articleBatch = self._taskQueue.get()
			if articleBatch == END_OF_QUEUE:
				self._taskQueue.task_done()
				break
			else:
				lineListBatch = []
				toProcessSentenceBatch = []
				#row layout: 0-8 article part, 9 sentence, 10 quoted parts, 11 cite words,
				#12-16 NER slots, 17 word count, 18+ match counts followed by the matched words
				sentenceTextIndex, NERStartIndex, NERPartCount, wordMatchStartIndex = 9, 12, 5, 18
				#add byline_cleaned in articleDict
				self.processBylineInBatch(articleBatch)
				for article in articleBatch:
					self._db.setArticleProcessed(article['_id'])
					articlePathPartList = article['filePath'].split('/')
					articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
					articleCompany = self._db.getCompanyByCode(articleCompanyCode)
					articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
					articleLineListPart = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], article['date'], article['sourceName'].strip(), article['byline'].strip(), article['byline_cleaned'], article['headline'].strip()]

					for paragraph in [article['leadParagraph'], article['tailParagraph']]:
						#if found qouted part in this paragraph
						quotedStringList = re.findall(quotePattern, paragraph)
						if quotedStringList and max([len(string.split()) for string in quotedStringList]) > 5:
							#Among all the quoted parts, the max word count MUST bigger than 5
							#If so, then get all sentences
							sentenceList = sent_tokenize(paragraph)
							for sentence in sentenceList:
								quotedStringList = re.findall(quotePattern, sentence)
								citeWordList = re.findall(citeWordPattern, sentence)
								#If this sentence has quotation and quoted part word cout is bigger than 5 and has cite word
								#Then parse it add to the export
								if quotedStringList and max([len(string.split()) for string in quotedStringList]) > 5 and citeWordList:
									lineList = articleLineListPart + [sentence, '. '.join(quotedStringList), ', '.join(citeWordList)] + [''] * NERPartCount + [len(sentence.split())] + [''] * len(wordMatchPatternList) * 2
									# Macth the keyword in dictionary
									for i, pattern in enumerate(wordMatchPatternList):
										matchedWordList = getMatchWordListFromPattern(sentence, pattern, filterWordDict)
										lineList[i + wordMatchStartIndex] = len(matchedWordList)
										lineList[i + len(wordMatchPatternList) + wordMatchStartIndex] = ', '.join(matchedWordList)
									lineListBatch.append(lineList)
									toProcessSentenceBatch.append(sentence)
				#one NER call for the whole batch; result i is paired with row i of lineListBatch
				actorAndOrgListBatch = self.processCiteSentenceInBatch(toProcessSentenceBatch)
				for i, actorAndOrgList in enumerate(actorAndOrgListBatch):
					#rows whose sentence has neither person nor organization are dropped
					if actorAndOrgList is not None:
						engagerNameList = re.findall(engagerNamePattern, lineListBatch[i][sentenceTextIndex])
						#FCEO becomes 1 when any part of a found person name occurs in the company's CEO entry
						FCEO = 0
						articleCompanyCode = lineListBatch[i][0]
						for name in actorAndOrgList[0].split(', '):
							for namePart in name.split():
								if articleCompanyCode in companyCEODict and companyCEODict[articleCompanyCode].find(namePart) != -1:
									FCEO = 1
						lineListBatch[i][NERStartIndex] = actorAndOrgList[0]
						lineListBatch[i][NERStartIndex + 1] = actorAndOrgList[1]
						lineListBatch[i][NERStartIndex + 2] = ' '.join(engagerNameList)
						lineListBatch[i][NERStartIndex + 3] = FCEO
						unQuotedPart = re.sub(r'"[^"]+"', '', lineListBatch[i][sentenceTextIndex])
						findBrokerage = False
						for pattern in brokerageNamePatternList:
							result = pattern.search(unQuotedPart)
							#the match only counts when its first character is upper case
							if result is not None and result.string[result.regs[0][0]].isupper():
								findBrokerage = True
								break
						lineListBatch[i][NERStartIndex + 4] = 1 if findBrokerage else 0
						self._resultQueue.put(lineListBatch[i])
				self._taskQueue.task_done()

	def getNERTaggedTupleListFromSentence(self, sentence):
		"""Run the senna NER tagger on the text and return [word, tag] lists.

		The text is written to temp/input.txt, senna is run through the
		shell, and temp/output.txt is parsed. In each output line the second
		column is reduced to the part after its last '-'.
		"""
		#NOTE(review): the temp file paths are fixed, so concurrent worker threads would share them; confirm only one runs NER at a time
		#use senna name entity tagger, it fast!!
		sentence = unicode(sentence).encode('utf-8', 'ignore')
		with open('temp/input.txt', 'w') as f:
			f.write(sentence)
		os.system('./senna/senna -path senna/ -ner <temp/input.txt> temp/output.txt')
		with open('temp/output.txt', 'r') as f:
			tagTupleList = [[word.strip().split('-')[-1] if i ==1  else word.strip() for i, word in enumerate(line.split())] for line in f.readlines() if line.split()]
		return tagTupleList

	def processBylineInBatch(self, articleBatch):
		"""Set article['byline_cleaned'] to the person names found in each byline.

		All bylines are tagged in one NER call, joined by a '******'
		separator. Consecutive PER-tagged words form one name and the names
		of a byline are joined with ', '. Articles for which no tokens are
		left get an empty string.
		"""
		#use 'null.' to replace '' of byline, because if the last sentence byline is '', it will not be add to concatenated string.
		tagTupleList = self.getNERTaggedTupleListFromSentence(' ****** '.join([article['byline'] if article['byline'] else 'null.' for article in articleBatch]))

		personList, lastTag, wordList  = [], '', []
		articleIndex = 0
		for tagTuple in tagTupleList:
			word, tag = tagTuple[0], tagTuple[1]
			if word.find('****') != -1:
				#separator token: the current byline is complete, flush a pending name first
				if lastTag == 'PER':
					personList.append(' '.join(wordList))
				articleBatch[articleIndex]['byline_cleaned'] = ', '.join(personList)
				personList, lastTag, wordList = [], '', []
				articleIndex += 1
				if articleIndex >= len(articleBatch):
					return
				continue

			if tag != lastTag:
				#tag changed: the previous run is complete
				if lastTag == 'PER':
					personList.append(' '.join(wordList))
				wordList = [word]
				lastTag = tag
			else:
				wordList.append(word)

		#the last byline has no trailing separator: flush it here instead of blanking it
		if articleIndex < len(articleBatch):
			if lastTag == 'PER':
				personList.append(' '.join(wordList))
			articleBatch[articleIndex]['byline_cleaned'] = ', '.join(personList)
			articleIndex += 1

		#any article beyond the tagged bylines gets an empty value
		while articleIndex < len(articleBatch) :
			articleBatch[articleIndex]['byline_cleaned'] = ''
			articleIndex += 1


	def processCiteSentenceInBatch(self, sentenceBatch):
		"""Return, per sentence, the persons and organizations named outside quotes.

		All sentences are tagged in one NER call, joined by a '******'
		separator. The result holds one entry per detected sentence end:
		None when neither a person nor an organization was found, otherwise
		[persons joined by ', ', organizations joined by ', '].
		"""
		tagTupleList = self.getNERTaggedTupleListFromSentence(' ****** '.join(sentenceBatch))

		personAndOrgListBatch = []
		personList, orgnizationList, inQuoteFlag, lastTag, wordList  = [], [], False, '', []
		for i in range(len(tagTupleList)):
			if tagTupleList[i][0] == '\"':
				#a quote character toggles the in-quote state; words inside quotes are ignored
				inQuoteFlag = 1 - inQuoteFlag
				if not inQuoteFlag:
					del wordList[:]
			else:
				if not inQuoteFlag:
					if tagTupleList[i][1] != lastTag:
						if lastTag == 'PER':
							personList.append(' '.join(wordList))
						elif lastTag == 'ORG':
							orgnizationList.append(' '.join(wordList))
						wordList = [tagTupleList[i][0]]
						lastTag = tagTupleList[i][1]
					else:
						wordList.append(tagTupleList[i][0])

			if tagTupleList[i][0].find('****') != -1 or i == len(tagTupleList) - 1:
				#end of one sentence
				if not personList and not orgnizationList:
					personAndOrgListBatch.append(None)
				else:
					personAndOrgListBatch.append([', '.join(personList), ', '.join(orgnizationList)])
				personList, orgnizationList, inQuoteFlag, lastTag, wordList  = [], [], False, '', []

		return personAndOrgListBatch

	def run(self):
		"""Thread entry point: call the function stored in _executeFunction."""
		self._executeFunction()

コード例 #38

0

ファイルを表示

ファイル: DataExporter.py プロジェクト: exsonic/FatSecret_Crawler

class DataExporter(object):
    """Exports users, groups and challenges from the database to CSV/text files."""

    def __init__(self):
        self.db = DBController()

    def valueToCSVFormat(self, value):
        """Render one value as a CSV field.

        Empty strings and None become an empty field. Text is wrapped in
        double quotes with embedded double quotes doubled. Numbers are
        rendered with str() and datetimes as YYYY-MM-DD.

        Raises:
            Exception: for any other value type.
        """
        try:
            textTypes = (str, unicode)  # Python 2: byte and unicode strings
        except NameError:
            textTypes = (str,)  # Python 3: all text is str

        if value == '' or value is None:
            return ''
        elif isinstance(value, textTypes):
            if not isinstance(value, str):
                # Python 2 unicode object: encode it for the byte-oriented file.
                value = value.encode('utf-8', 'ignore')
            # Double embedded quotes so the field stays one CSV column.
            return '"' + value.replace('"', '""') + '"'
        elif isinstance(value, (int, float)):
            return str(value)
        elif isinstance(value, datetime):
            return datetime.strftime(value, '%Y-%m-%d')
        else:
            raise Exception('Value data type ERROR, must be string, int, float or datetime')

    def _formatLine(self, values):
        # Join the CSV-formatted values into one newline-terminated row.
        return ','.join([self.valueToCSVFormat(value) for value in values]) + '\n'

    def exportHistory(self, isDietHistory):
        """Write every user's diet or weight history to a CSV file.

        Args:
            isDietHistory: True exports 'dietHistory' to dietHistory.csv,
                False exports 'weightHistory' to weightHistory.csv.

        Users whose row cannot be built are reported with print and skipped.
        """
        if isDietHistory:
            key = 'dietHistory'
            attributeLine = 'id,name,count,lastUpdateDate,updateTime,food,RDI,fat,protein,carbs,exercise,net\n'
        else:
            key = 'weightHistory'
            attributeLine = 'id,name,count,lastUpdateDate,lastUpdateWeight,startWeight,goalWeight,updateTime,weight\n'
        fileName = key + '.csv'

        with open(fileName, 'w') as f:
            f.write(attributeLine)
            users = self.db.getAllUserList()
            users = sorted(users, key=lambda user: user['id'])
            for user in users:
                lineList = [user['id'], user['name']]
                try:
                    if key in user and user[key] is not None and len(user[key]) > 0:
                        lineList.append(len(user[key]))
                        # First field of the most recent history entry.
                        lineList.append(user[key][-1][0])
                        if not isDietHistory:
                            lineList.append(user[key][-1][1])
                            lineList.append(user['startWeight'])
                            lineList.append(user['goalWeight'])
                        for detailInfoTuple in user[key]:
                            lineList.extend(detailInfoTuple)
                    else:
                        lineList.append(0)
                    f.write(self._formatLine(lineList))
                except Exception as e:
                    # A single pre-formatted argument prints the same text on
                    # Python 2 and Python 3.
                    print('%s %s' % (e, user['id']))

    def exportGroupChallenge(self, isGroup):
        """Write all groups (or challenges) with their member ids to a CSV file.

        Args:
            isGroup: True exports groups to group.csv, False exports
                challenges to challenge.csv.
        """
        items = self.db.getAllGroupList() if isGroup else self.db.getAllChallengeList()
        items = sorted(items, key=lambda item: item['id'])
        fileName = 'group.csv' if isGroup else 'challenge.csv'
        with open(fileName, 'w') as f:
            f.write('id,name,count,memberId\n')
            for item in items:
                lineList = [item['id'], item['name']]
                if 'member' in item and item['member'] is not None:
                    lineList.append(len(item['member']))
                    lineList.extend(item['member'])
                else:
                    lineList.append(0)
                f.write(self._formatLine(lineList))

    def exportUserGroupChallenge(self, isGroup):
        """Write each user's group (or challenge) ids to a CSV file.

        Args:
            isGroup: True exports the 'group' field to userGroup.csv, False
                exports the 'challenge' field to userChallenge.csv.
        """
        key = 'group' if isGroup else 'challenge'
        fileName = 'userGroup.csv' if isGroup else 'userChallenge.csv'
        attributeLine = 'id,count,group\n' if isGroup else 'id,count,challenge\n'
        with open(fileName, 'w') as f:
            users = self.db.getAllUserList()
            users = sorted(users, key=lambda user: user['id'])
            f.write(attributeLine)
            for user in users:
                lineList = [user['id']]
                if key in user and user[key] is not None:
                    lineList.append(len(user[key]))
                    lineList.extend(user[key])
                else:
                    lineList.append(0)
                f.write(self._formatLine(lineList))

    def exportBuddy(self):
        """Write one buddy/<user id>.txt file per user.

        Each line is '<user id> <buddy id>', buddies sorted. Users without
        buddies get an empty file.
        """
        users = self.db.getAllUserIter()
        if not os.path.exists('buddy/'):
            os.mkdir('buddy/')
        for user in users:
            fileName = 'buddy/' + str(user['id']) + '.txt'
            # Opening in 'w' mode creates the (possibly empty) file either way.
            with open(fileName, 'w') as f:
                if 'buddy' in user and user['buddy'] is not None:
                    for userId in sorted(user['buddy']):
                        f.write(str(user['id']) + ' ' + str(userId) + '\n')

    def getUserIdNameDict(self):
        """Return a dict mapping every user's id to that user's name."""
        userIdNameDict = {}
        for user in self.db.getAllUserIter():
            userIdNameDict[user['id']] = user['name']
        return userIdNameDict

コード例 #39

0

ファイルを表示

ファイル: Utils.py プロジェクト: exsonic/DialogueAnalysis

def loadAllDialoguesFromFile(speakerTypeFilePath, folderPath):
	"""Rebuild the dialogue database from a speaker-type CSV and a folder of speech files.

	The existing database is dropped first. The CSV maps speaker names to
	ids per speaker type (columns 14, 15 and 16; the header row is skipped).
	Every '.txt' file inside a directory whose name starts with 'chunk' is
	parsed from its underscore-separated file name into a conference, a
	session and a speech, and each of them is inserted when not already
	stored. Files that fail are reported with print and skipped.

	Args:
		speakerTypeFilePath: path of the speaker type CSV file.
		folderPath: root folder that is walked for speech files.
	"""
	db = DBController()
	db.dropDB()
	ensuredIndex = False
	ADict, CDict, JDict, DotDict = {}, {}, {}, {}
	#load the speaker type csv file
	with open(speakerTypeFilePath, 'rU') as f:
		lines = csv.reader(f)
		for i, line in enumerate(lines):
			if i == 0:
				continue
			speakerName, speakerType, speakerId = line[14].strip(), line[15].strip().upper(), line[16].strip()
			if speakerType == TYPE_ANALYST:
				ADict[speakerName] = speakerId
			elif speakerType == TYPE_CEO:
				CDict[speakerName] = speakerId
			elif speakerType == TYPE_JOURNALIST:
				JDict[speakerName] = speakerId
			elif speakerType == TYPE_DOT:
				DotDict[speakerName] = speakerId
			else:
				#unknown speaker type: report it and move on
				print(speakerName, speakerType)


	for dirPath, dirNames, fileNames in os.walk(folderPath):
		print(dirPath)
		if os.path.split(dirPath)[-1].startswith('chunk'):
			for fileName in fileNames:
				try:
					if fileName.endswith('txt'):
						#file name layout: company_time_sessionType_sessionOrder_asker_answerer_..._speakerName<order>.txt
						fileNameParts = [part.strip() for part in fileName.split('.txt')[0].split('_')]
						company, time = fileNameParts[0], fileNameParts[1]
						sessionType, sessionOrder, asker, answerer = fileNameParts[2], int(fileNameParts[3]), fileNameParts[4], fileNameParts[5]
						if fileNameParts[-1].endswith('default') or fileNameParts[-1].endswith('copy'):
							continue
						elif fileNameParts[-1][-1].isdigit() and not fileNameParts[-1][-2].isdigit():
							#one trailing digit is the speech order
							speakerName = fileNameParts[-1][:-1].strip()
							speechOrder = int(fileNameParts[-1][-1:])
						elif fileNameParts[-1][-1].isdigit() and fileNameParts[-1][-2].isdigit():
							#two trailing digits are the speech order
							speakerName = fileNameParts[-1][:-2].strip()
							speechOrder = int(fileNameParts[-1][-2:])
						else:
							continue
						conference =  db.getConferenceByCompanyTime(company, time)
						if conference is None:
							conference = {'company' : company, 'time' : time}
							conference = db.insertConference(conference)

						session = db.getSessionByConferenceAndOrder(conference['_id'], sessionOrder)
						if session is None:
							#store the session under the same order it is looked up by;
							#using the speech order here made the lookup miss and created duplicate sessions
							session = {'conference' : conference['_id'], 'order' : sessionOrder, 'type' : sessionType, 'asker' : asker, 'answerer' : answerer}
							session = db.insertSession(session)

						speech = db.getSpeechByConferenceIdAndSessionIdAndOrder(conference['_id'], session['_id'], speechOrder)
						if speech is None:
							if speakerName in ADict:
								speakerType, speakerId = TYPE_ANALYST, ADict[speakerName]
							elif speakerName in CDict:
								speakerType, speakerId = TYPE_CEO, CDict[speakerName]
							elif speakerName in JDict:
								speakerType, speakerId = TYPE_JOURNALIST, JDict[speakerName]
							elif speakerName in DotDict:
								speakerType, speakerId = TYPE_DOT, DotDict[speakerName]
							else:
								#speaker missing from the CSV: fall back to TYPE_DOT with no id
								speakerType, speakerId = TYPE_DOT, ''
								print(fileName, speakerName)

							filePath = os.path.join(dirPath, fileName)
							with open(filePath, 'rU') as f:
								text = ' '.join(f.readlines()).strip()
								#drop every non-ASCII character from the speech text
								text = text.decode('ascii', 'ignore').encode('ascii', 'ignore')
								speech = {'conference' : conference['_id'], 'session' : session['_id'], 'order' : speechOrder, 'text' : text,
								          'speakerName' : speakerName, 'speakerType' : speakerType, 'speakerId' : speakerId}
								db.insertSpeech(speech)
						#indexes are created once, after the first speech file has been handled
						if not ensuredIndex:
							db.ensureIndex()
							ensuredIndex = True
				except Exception as e:
					print(fileName)
					print(e)

コード例 #40

0

ファイルを表示

ファイル: FeatureGenerator.py プロジェクト: exsonic/BillboardPredictor

 def extractFeatureToDB(self, beginWeek, endWeek=None, isReload=False, useAlchemyAPI=False):
     """Compute and store the weekly feature rows for every ranked song.

     For each week from ``beginWeek`` to ``endWeek`` (both moved with
     ``dateToSaturday``), every song ranked in the previous week gets one
     feature dict built from the DB lookups below and inserted.

     Args:
         beginWeek: first week to process; must not be before 2007-01-07.
         endWeek: last week to process; defaults to the current time,
             resolved at call time. Must not be in the future.
         isReload: when False, songs that already have a feature row for the
             week are skipped.
         useAlchemyAPI: forwarded to the review/comment/twitter lookups.

     Raises:
         Exception: when the date range is invalid.
     """
     # A `datetime.today()` default in the signature is evaluated only once,
     # when the function is defined, and goes stale in a long-running process.
     if endWeek is None:
         endWeek = datetime.today()
     if beginWeek < datetime(2007, 1, 7) or endWeek > datetime.today():
         raise Exception('Invalid input date!')
     beginWeek, endWeek = dateToSaturday(beginWeek), dateToSaturday(endWeek)
     # If snapping to Saturday landed in the future, step back one week.
     endWeek = endWeek - timedelta(days=7) if endWeek > datetime.today() else endWeek
     iterWeek = beginWeek
     db = DBController()
     while iterWeek <= endWeek:
         # Rank-based features come from the week before the one being predicted.
         lastWeek = iterWeek - timedelta(days=7)
         songRankList = db.getSongIdListByWeek(lastWeek)
         for songId in songRankList:
             if isReload == False and db.isFeatureInDB(iterWeek, songId):
                 continue
             featureDict = {}
             featureDict['id'] = songId
             featureDict['week'] = iterWeek
             featureDict['sales'] = db.getSalesRank(lastWeek, songId)
             featureDict['radio'] = db.getRadioRank(lastWeek, songId)
             featureDict['streaming'] = db.getStreamingRank(lastWeek, songId)
             featureDict['MVView'], featureDict['MVSocialInteraction'] = db.getIMVDBData(iterWeek, songId)
             featureDict['MTVReviewCount'], featureDict['MTVReviewScore'] = db.getMTVReviewData(iterWeek, songId, useAlchemyAPI)
             featureDict['youtubeCommentCount'], featureDict['youtubeCommentScore'] = db.getYoutubeData(iterWeek, songId, useAlchemyAPI)
             featureDict['twitterCount'], featureDict['twitterScore'] = db.getTwitterData(iterWeek, songId, useAlchemyAPI)
             featureDict['rank'] = db.getTop50Rank(iterWeek, songId)
             db.insertFeatureToDB(featureDict)
         iterWeek += timedelta(days=7)

コード例 #41

0

ファイルを表示

ファイル: Crawler.py プロジェクト: himanshudas/APKCrawler

class Crawler:
    """Collect Google Play chart metadata and download matching APK files.

    Chrome is driven through Selenium, results are persisted through
    DBController, and APK files are written below the configured directory.
    """

    def __init__(self, is_desktop):
        """Read ``config.ini``, start the browser and open the database.

        Args:
            is_desktop: When falsy, a hidden 800x600 ``Display`` is started
                before Chrome is launched.
        """
        config = configparser.ConfigParser()
        config.read('config.ini')
        self.apk_directory = config.get('Setting', 'APK_DIRECTORY')
        self.is_desktop = is_desktop
        if not is_desktop:
            self.display = Display(visible=0, size=(800, 600))
            self.display.start()

        self.chrome = webdriver.Chrome(
            config.get('Setting', 'CHROME_DRIVER_DIRECTORY'))
        self.category_list = config.items('PlayStoreURL')
        self.db_connector = DBController(config.get('Setting', 'DB_DIRECTORY'))
        self.db_connector.create_table()

    def __get_new_app_list(self, popular_url):
        """Return the package names shown on a category's popular chart.

        Args:
            popular_url: URL of the chart page to load.

        Returns:
            A list of package names, taken from the ``id=`` part of each
            app card's title link.
        """
        self.chrome.get(popular_url)
        self.chrome.implicitly_wait(10)

        # The page has to be scrolled before the apps down to rank 300 appear.
        for scroll in (10000, 20000, 30000, 40000, 50000):
            self.chrome.execute_script(
                "window.scrollTo(0," + str(scroll) + ");")
            time.sleep(2)

        package_list = []
        # Select the app card divs of the chart.
        div_app_list = self.chrome.find_elements_by_css_selector(
            ".card.no-rationale.square-cover.apps.small")
        for div_app in div_app_list:
            app_detail = div_app.find_element_by_class_name('details')
            url = app_detail.find_element_by_class_name('title')\
                .get_attribute('href')
            package_list.append(url.split('id=')[1])

        return package_list

    def __get_app_detail(self, package_list):
        """Crawl name, image source and update date for each package.

        Args:
            package_list: Package names to look up on the Play Store.

        Returns:
            A list of ``[name, package, img_src, updated_date, False]`` rows.
            Packages whose detail page could not be parsed are left out.
        """
        # Base URL of an app's detail page; the package name is appended.
        base_url = 'https://play.google.com/store/apps/details?id='
        detail_list = []

        for package in package_list:
            self.chrome.get(base_url + package)
            self.chrome.implicitly_wait(10)

            try:
                name = self.chrome.find_element_by_css_selector(
                    '.id-app-title').text
                img_src = self.chrome.find_element_by_css_selector(
                    '.cover-image').get_attribute('src')
                updated_date = self.chrome.find_elements_by_css_selector(
                    '.content')[0].text
                # The rating count is not stored, but a page without one is
                # still treated as unparsable, so the lookup is kept.
                self.chrome.find_elements_by_css_selector(
                    '.rating-count')[0].text.replace(',', '')
            except Exception:
                print(package + " 오류 발생")
                print(package + " name, img_src, update_date 가져오기 실패")
                continue

            # The trailing False corresponds to the isDownloaded column.
            detail_list.append([name, package, img_src, updated_date, False])

        return detail_list

    def __download_apk(self, package_name, download_url):
        """Download an APK over HTTP and store it as ``<package_name>.apk``.

        Args:
            package_name: Used to build the target file name.
            download_url: URL the APK is fetched from (60 s timeout).

        Returns:
            True when the file was written, False when the request timed
            out, returned an HTTP error status, or any other error occurred.
        """
        file_name = str(package_name) + '.apk'
        try:
            r = requests.get(download_url, timeout=60)
            # Do not save an HTTP error page as if it were the APK.
            r.raise_for_status()
            with open(self.apk_directory + file_name, 'wb') as apk:
                apk.write(r.content)
        except requests.exceptions.Timeout:
            print('time out')
            return False
        except Exception as e:
            print(e)
            return False
        return True

    def crawl_new(self):
        """Crawl every configured chart and store the apps found on it."""
        for category in self.category_list:
            category_name = category[0]
            url = category[1]

            # Package names currently listed on the category chart.
            new_package_list = self.__get_new_app_list(url)

            # Crawl name, update date and image source for each of them.
            updated_app_list = self.__get_app_detail(new_package_list)

            # Write the collected rows to the database.
            self.db_connector.update_app(updated_app_list, category_name)

        self.db_connector.commit_n_close()

    def crawl_old(self):
        """Refresh the details of the apps already stored per category."""
        for category in self.category_list:
            category_name = category[0]

            # Packages already stored for this category.
            # NOTE(review): the whole (name, url) pair is passed here, not
            # category_name -- confirm what get_old_category_app_list expects.
            old_package_list = self.db_connector\
                .get_old_category_app_list(category)

            # Crawl name, update date and image source for each of them.
            updated_app_list = self.__get_app_detail(old_package_list)

            # Write the collected rows to the database.
            self.db_connector.update_app(updated_app_list, category_name)
        self.db_connector.commit_n_close()

    def update_apk(self):
        """Download the APK of every package reported by ``not_updated_list``.

        Each package is searched on APKPure. Packages without a search
        result whose link contains the package name, or without a download
        iframe, are skipped. The isdownload flag is updated with the
        outcome of each attempted download.
        """
        not_updated_list = self.db_connector.not_updated_list()
        print(1)
        for package_row in not_updated_list:
            package_name = package_row[0]
            search_url = 'http://apkpure.com/search?q=' + package_name
            self.chrome.get(search_url)
            self.chrome.implicitly_wait(10)

            # Look for the app among the search results.
            search_titles = self.chrome.\
                find_elements_by_class_name('search-title')

            # Skip APKs that APKPure does not list at all.
            if len(search_titles) == 0:
                logging.info(package_name + " is not searched")
                continue

            # The matching result carries the package name in its href.
            # Only a link that really matches is kept, so the guard below
            # can fire when none of the results belongs to this package.
            link = ''
            for title in search_titles:
                href = title.find_element_by_tag_name('a')\
                    .get_attribute('href')
                if href and package_name in href:
                    link = href
                    break

            # There were results, but none of them matches: skip.
            if link == '':
                logging.info(package_name + ' is not searched in APKpure')
                continue

            print(link)  # debug
            self.chrome.get(link)
            self.chrome.implicitly_wait(10)

            a_list = self.chrome.find_elements_by_class_name(' down')
            try:
                for a in a_list:
                    link = a.get_attribute('href')
                    # A href containing the package name is the download page.
                    if package_name in link:
                        self.chrome.get(link)
                        self.chrome.implicitly_wait(10)
                        break
                # The iframe is sometimes missing; in that case this APK is
                # skipped and the loop moves on to the next one.
                iframe = self.chrome.find_element_by_id('iframe_download')

                src = iframe.get_attribute('src')
            except Exception:
                logging.info(package_name + " does not have href or iframe")
                continue

            downloaded = self.__download_apk(package_name, src)
            self.db_connector.update_isdownload(package_name, bool(downloaded))

        self.db_connector.commit_n_close()

    def close(self):
        """Shut down the browser and, if one was started, the display."""
        # WebDriver offers quit()/close(); it has no stop() method.
        self.chrome.quit()
        if not self.is_desktop:
            self.display.stop()

コード例 #42

0

ファイルを表示

ファイル: DataExporter.py プロジェクト: exsonic/CorpusAnalysis

	def __init__(self):
		"""Create the work queues, the database handle and the thread bookkeeping."""
		# Results are collected on one queue, pending tasks on the other.
		self._resultQueue, self._taskQueue = Queue(), Queue()
		self._db = DBController()
		# One worker by default; started threads are tracked in the list.
		self._threadNumber, self._threadList = 1, []

コード例 #43

0

ファイルを表示

ファイル: SentenceClusterer.py プロジェクト: exsonic/CorpusAnalysis

 def __init__(self):
     """Create the KMeans clusterer (TOTAL_CLUSTER clusters) and the database handle."""
     controller = DBController()
     kmeans = KMeans(n_clusters=TOTAL_CLUSTER)
     self.db = controller
     self.clusterer = kmeans

コード例 #44

0

ファイルを表示

ファイル: DataExtractor.py プロジェクト: exsonic/FatSecret_Crawler

 def __init__(self):
     """Open the database handle, then store the result of ``self.login()``."""
     self.db = DBController()
     # login() runs after self.db is set; keep this order in case it uses the handle.
     self.br = self.login()

コード例 #45

0

ファイルを表示

ファイル: TrainingSample.py プロジェクト: phygitalism/phyge

 def __iter__(self):
     """Yield a BaseArticle for every id in ``self.articles_id``, fetched via DBController."""
     for article_id in self.articles_id:
         record = DBController.get_article(article_id)
         yield BaseArticle(record)

コード例 #46

0

ファイルを表示

ファイル: YoutubeCommentsExtractor.py プロジェクト: exsonic/BillboardPredictor

            while commentFeed is not None:
                for comment in commentFeed.entry:
                    commentText = comment.content.text
                    commentDate = dateStringToSaturday(comment.updated.text)
                    commentList.append({'week' : commentDate, 'comment' : commentText})
                next_link = commentFeed.GetNextLink()
                if next_link is None:
                    commentFeed = None
                else:
                    commentFeed = self.client.GetYouTubeVideoCommentFeed(next_link.href)
        except Exception, e:
            print e
        return commentList
        
    def extractYoutubeCommentsToDB(self, songList):
        """Fetch the comments of each song's video and store them in the database.

        The video is looked up by "<title> <artist>". Any error raised for
        one song is printed and the loop carries on with the next song.
        """
        controller = DBController()
        for entry in songList:
            try:
                query = entry['title'] + ' ' + entry['artist']
                fetched = self.getComments(self.getVideoID(query))
                controller.insertCommentToDB(entry['id'], fetched)
            except Exception as e:
                print(e)

if __name__ == '__main__':
    # Script entry point: store the comments for the songs of last Saturday's week.
    extractor = YoutubeCommentsExtractor()
    songList = DBController().getSongByWeek(lastSaturday())
    extractor.extractYoutubeCommentsToDB(songList)

コード例 #47

0

ファイルを表示

from Storage import Storage


def check_db_status():
    """Seed the database when ``DBController.get_all_articles()`` yields nothing."""
    article_count = sum(1 for _ in DBController.get_all_articles())
    if article_count != 0:
        return
    print('Seeding database...')
    DatabaseSeeder.seed()


if __name__ == "__main__":
    # Script entry point: make sure the database has content, train the LSI
    # model on all articles and save it under out/lsi.
    check_db_status()

    # limit=None is passed explicitly; presumably this lifts a default cap -- confirm.
    articles = DBController.get_all_articles(limit=None)

    # NOTE(review): despite its name, this sample is what the model is trained on.
    testing_sample = TrainingSample(articles)

    lsi = LsiModel(model_name='phyge')
    # The other model types are currently disabled:
    # lda = LdaModel(model_name='phyge')
    # d2v = D2vModel(model_name='phyge')
    # fast_text = FastTextModel(model_name='phyge')

    lsi.train_model(testing_sample)
    # lda.train_model(testing_sample)
    # d2v.train_model(testing_sample)
    # fast_text.train_model(testing_sample)

    Storage.save_model(lsi, path='out/lsi')
    # Storage.save_model(lda, path='out/lda')

コード例 #48

0

ファイルを表示

ファイル: FileUtils.py プロジェクト: exsonic/CorpusAnalysis

def loadEngagerAndCompanyToDB(filePath):
	"""Load engagers and companies from a CSV file into the database.

	The first row is skipped as a header. For every other row the engager
	(column 5) is inserted if unknown; the company (column 3) is inserted if
	unknown, otherwise its CEO mapping gets the entry
	``row[1] -> engager id``. Afterwards the generic engagers 'CEO',
	'Executive' and 'analyst' are inserted.
	"""
	with open(filePath, 'rU') as csvFile:
		db = DBController()
		for rowIndex, rawRow in enumerate(csv.reader(csvFile)):
			if rowIndex == 0:
				# Header row.
				continue
			row = [cell.strip() for cell in rawRow]
			engagerName = row[5]
			if db.getEngagerByName(engagerName) is None:
				db.insertEngager({'name' : engagerName, 'lastName' : row[6], 'type' : ENGAGER_CEO, 'gender' : row[-1]})
			companyName = row[3]
			if db.getCompanyByName(companyName) is not None:
				# Known company: add this engager to its CEO mapping.
				engager = db.getEngagerByName(engagerName)
				company = db.getCompanyByName(companyName)
				company['CEO'][row[1]] = engager['_id']
				db.updateCompanyCEO(company['_id'], company['CEO'])
			else:
				# New company: create it with a one-entry CEO mapping.
				engager = db.getEngagerByName(engagerName)
				db.insertCompany({'_id' : int(row[2]), 'name' : companyName, 'shortName' : row[4], 'code' : row[0], 'CEO' : {row[1] : engager['_id']}})


		# Generic placeholder engagers, inserted after all rows are processed.
		genericEngagers = (('CEO', ENGAGER_CEO), ('Executive', ENGAGER_CEO), ('analyst', ENGAGER_ANALYST))
		for name, engagerType in genericEngagers:
			db.insertEngager({'name' : name, 'lastName' : name, 'type' : engagerType})

コード例 #49

0

ファイルを表示

 def __init__(self):
     """Create the private DBController with the arguments 'localhost' and 27017."""
     # Presumably host and port of a local MongoDB instance -- confirm against DBController.
     self.__dbController = DBController('localhost', 27017)

コード例 #50

0

ファイルを表示

ファイル: DatabaseSeeder.py プロジェクト: phygitalism/phyge

 def seed(cls):
     """Run DBController.first_setup(), then seed the web articles."""
     DBController.first_setup()
     # Seeding runs after first_setup(); keep this order.
     cls.__seed_web_articles()

コード例 #51

0

ファイルを表示

                        wrong_ids[model_name].append(
                            (true_sourse, found_sourse))
                output_answer.append(
                    dict(true_sourse=true_sourse,
                         sourse=found_sourse,
                         model=model_name,
                         title=answer[model_name][0]['title'],
                         similarity=answer[model_name][0]['similarity']))
        file.write(json.dumps(output_answer, indent=2, ensure_ascii=False))
        file.write(json.dumps(wrong_ids, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    # Script entry point: seed the database if it is empty, load the four
    # stored models and run the search test for the configured query set.
    # NOTE(review): log_of_result is not used in the visible part of this
    # script; it may be read as a global elsewhere -- confirm before removing.
    log_of_result = []

    if len(DBController.get_all_documents()) == 0:
        print('Seeding database...')
        DatabaseSeeder.seed()

    lsi = Storage.load_model('out/lsi', 'phyge', 'lsi')

    lda = Storage.load_model('out/lda', 'phyge', 'lda')
    d2v = Storage.load_model('out/d2v', 'phyge', 'd2v')
    fast_text = Storage.load_model('out/fast_text', 'phyge', 'ft')
    # search_engine is not passed to run_search below; presumably run_search
    # reads it as a module-level global -- confirm.
    search_engine = SearchEngine(models=[fast_text, d2v, lda, lsi])
    # Test files live in <testsDir>/test_<queriesId>/.
    test_path = os.path.join(PhyVariables.testsDir,
                             'test_' + str(PhyVariables.queriesId))
    run_search(os.path.join(test_path, PhyVariables.queriesFileName),
               os.path.join(test_path, PhyVariables.answersFileName), 1)
    # run_search('Resources/pdf_articles.json','Resources/answers.json',1)

コード例 #52

0

ファイルを表示

ファイル: index.py プロジェクト: jun10000/onkyori_listener

# Author: jun10000 (https://github.com/jun10000)
#

import os
import signal
import sys
import time
from collections import deque
from RPi import GPIO

import settings
import JunLib
import wave_signal
from DBController import DBController

# Module-level controller, created at import time; initialise() calls connect() on it.
db_controller = DBController()


def initialise():
    """Configure the GPIO input pin and connect the module-level DB controller."""
    # BCM numbering; the pin number comes from settings.INPUT_PIN.
    GPIO.setmode(GPIO.BCM)
    GPIO.setup(settings.INPUT_PIN, GPIO.IN)

    # The module-level name is only read here, so no ``global`` statement is required.
    db_controller.connect()


# Module-level deque; presumably filled by loop() below -- confirm, its body is not shown here.
buffer = deque()


# noinspection PyUnusedLocal
def loop(signal_num, frame):

コード例 #53

0

ファイルを表示

ファイル: DataExporter.py プロジェクト: exsonic/CorpusAnalysis

class DataExporterMaster():
	"""Drives DataProcessorThread workers and a CSVWriterThread to export article data."""

	def __init__(self):
		"""Create the result/task queues, the database handle and the worker bookkeeping."""
		self._resultQueue = Queue()
		self._taskQueue = Queue()
		self._db = DBController()
		# Number of worker threads to start; the started threads are kept in _threadList.
		self._threadNumber = 1
		self._threadList = []


	def exportAllCitationBlock(self):
		"""Export citation data of all articles to 'export/allCitationSentence.csv'.

		Articles are fetched in batches of 100 and handed to the workers
		through the task queue; a CSVWriterThread consumes the result queue.
		"""
		# A single thread is enough here (_threadNumber stays at its default of 1).
		attributeList = ['cotic', 'coname', 'filePath', 'accessNo', 'date', 'source', 'byline', 'byline_cleaned', 'headline', 'sentence', 'cite_content', 'cite_word', 'actor', 'organization', 'engager', 'FCEO', 'broker']
		attributeList += ['total_word_count', 'cau_int', 'cau_ext', 'cont_l', 'cont_h', 'pos', 'neg', 'uncert']
		attributeList += ['cau_int_words', 'cau_ext_words', 'cont_l_words', 'cont_h_words', 'pos_words', 'neg_words', 'uncert_words']
		# Comment out the next line and switch the writer mode to append ('a')
		# to continue a previous run instead of starting over.
		self._db.setAllArticleUnprocessed()
		writer = CSVWriterThread(self._resultQueue, 'export/allCitationSentence.csv', attributeList, mode='w')
		writer.start()

		# Must be 100; the original author reports a bug with other values.
		batchSize = 100
		for i in range(self._threadNumber):
			t = DataProcessorThread(self._taskQueue, self._resultQueue)
			# Select what this worker runs for every task it receives.
			t._executeFunction = t.processCitationBlock
			t.start()
			self._threadList.append(t)

		while True:
			isDone = False
			for i in range(self._threadNumber):
				articleBatch = list(self._db.getUnprocessedArticleInBatch(batchSize))
				# list() never returns None, so only the emptiness check can fire here.
				if articleBatch is None or not articleBatch:
					isDone = True
					break
				self._taskQueue.put(articleBatch)
			# Wait until the queued batches are marked done before fetching more.
			self._taskQueue.join()
			print('################')
			if isDone:
				break

		# One END_OF_QUEUE marker per worker.
		for i in range(self._threadNumber):
			self._taskQueue.put(END_OF_QUEUE)

		self._taskQueue.join()
		for t in self._threadList:
			t.join()
		# Only after all workers have finished is the writer told to stop.
		self._resultQueue.put(END_OF_QUEUE)
		self._resultQueue.join()
		writer.join()

	def exportKeywordSearch(self, searchString):
		"""Export articles matching searchString to 'export/keywordSearch.csv'.

		Uses four worker threads; each article from the search cursor is put
		on the task queue individually.
		"""
		self._threadNumber = 4
		attributeList = ['cotic', 'coname', 'filePath', 'accessNo', 'date', 'source', 'byline', 'headline', 'sentence']
		writer = CSVWriterThread(self._resultQueue, 'export/keywordSearch.csv', attributeList)
		writer.start()

		# NOTE(review): _threadList is appended to without being cleared, so
		# threads from an earlier export on this instance remain in it.
		for i in range(self._threadNumber):
			t = DataProcessorThread(self._taskQueue, self._resultQueue, searchString)
			t._executeFunction = t.processKeywordSearch
			t.start()
			self._threadList.append(t)

		articleListCursor = self._db.getAllArticleBySearchString(searchString)
		# This is a cursor, not a list: it is consumed while iterating.
		for article in articleListCursor:
			self._taskQueue.put(article)

		# One END_OF_QUEUE marker per worker.
		for i in range(self._threadNumber):
			self._taskQueue.put(END_OF_QUEUE)

		for t in self._threadList:
			t.join()
		self._resultQueue.put(END_OF_QUEUE)
		writer.join()

コード例 #54

0

ファイルを表示

ファイル: SignifierParser.py プロジェクト: exsonic/CorpusAnalysis

class SignifierParser(object):
	"""Tags sentences with engagers, companies and signifier words and stores them.

	Word lists are loaded through getWordList, engagers and companies
	through DBController; one regex per engager and per company is
	compiled once in the constructor.
	"""

	# Engagers with one of these last names are matched by their full name.
	# Presumably these last names are too ambiguous on their own -- confirm.
	_FULL_NAME_LAST_NAMES = frozenset(['Jones', 'Johnson', 'West', 'Post', 'Ford'])

	def __init__(self):
		"""Load the word lists, all engagers and companies, and compile the name patterns."""
		self.db = DBController()
		self.pfmWord = getWordList(WORD_PFM)
		self.posWord = getWordList(WORD_POS)
		self.negWord = getWordList(WORD_NEG)
		self.exWord = getWordList(ATRB_EX)
		self.inWord = getWordList(ATRB_IN)
		self.citeWord = getWordList(WORD_CITE)
		self.engagerList = list(self.db.getAllEngager())
		self.companyList = list(self.db.getAllCompany())

		self.engagerRegexPatternDict, self.companyRegexPatternDict = self.getRegexPatternDictForEngagerAndCompany()

	def getRegexPatternDictForEngagerAndCompany(self):
		"""Compile a whole-word pattern per engager and per company.

		Returns:
			A pair ``(engagerPatterns, companyPatterns)`` of dicts keyed by
			``_id``. Engagers are matched case-sensitively by last name
			(or by full name for the last names listed in
			_FULL_NAME_LAST_NAMES), companies case-insensitively by
			short name.
		"""
		engagerRegexPatternDict, companyRegexPatternDict = {}, {}
		for engager in self.engagerList:
			if engager['lastName'] in self._FULL_NAME_LAST_NAMES:
				searchName = engager['name']
			else:
				searchName = engager['lastName']
			# Names are literal text, so regex metacharacters must be escaped.
			engagerRegexPatternDict[engager['_id']] = re.compile(r'\b' + re.escape(searchName) + r'\b')

		for company in self.companyList:
			companyRegexPatternDict[company['_id']] = re.compile(r'\b' + re.escape(company['shortName']) + r'\b', re.IGNORECASE)

		return engagerRegexPatternDict, companyRegexPatternDict

	def extractAllSentenceToDB(self, isReload=False):
		"""Split every company's articles into sentences and store the tagged ones.

		Args:
			isReload: When truthy, stored sentences are dropped first.
		"""
		if isReload:
			self.db.dropSentence()
		for i, company in enumerate(self.companyList):
			articles = list(self.db.getAllArticleByCompanyCode(company['code']))
			engagers = list(self.db.getAllEngagerByCompanyId(company['_id']))
			for j, article in enumerate(articles):
				print(i, j)
				for key in ('leadParagraph', 'tailParagraph'):
					for string in sent_tokenize(article[key]):
						if not isValidSentence(string):
							continue
						# NOTE(review): encode() yields bytes on Python 3, which the
						# str patterns used by parseRawSentence cannot search -- confirm
						# the target interpreter before porting.
						sentenceDict = {'content' : string.encode('utf-8'), 'articleId' : article['_id'], 'paragraph' : key}
						sentenceDict = self.parseRawSentence(sentenceDict, engagers)
						if sentenceDict is not None:
							self.db.insertSentence(sentenceDict)

	def parseRawSentence(self, sentenceDict, engagerList):
		"""Attach the ids of the engagers and companies mentioned in a sentence.

		Args:
			sentenceDict: Dict with the sentence text under 'content'.
			engagerList: Engagers to look for; every company in
				``self.companyList`` is always checked.

		Returns:
			sentenceDict with 'engager' and 'company' id lists added, or
			None when neither an engager nor a company is mentioned.
		"""
		content = sentenceDict['content']
		engagerIdList = [engager['_id'] for engager in engagerList
			if self.engagerRegexPatternDict[engager['_id']].search(content) is not None]
		companyIdList = [company['_id'] for company in self.companyList
			if self.companyRegexPatternDict[company['_id']].search(content) is not None]

		if not engagerIdList and not companyIdList:
			return None
		sentenceDict['engager'] = list(set(engagerIdList))
		sentenceDict['company'] = list(set(companyIdList))
		return sentenceDict

	def parseAllSentenceCitation(self):
		"""Store citation words and the three citation flags for every sentence."""
		sentences = list(self.db.getAllSentence())
		for i, sentence in enumerate(sentences):
			print(i)
			words = getProcessedWordList(sentence['content'], VERB)
			# A real list (not a lazy filter object) so that truth tests and
			# repeated iteration behave the same on Python 2 and 3.
			sentence['cite'] = [word for word in words if word in self.citeWord]
			sentence['citeCEO'], sentence['citeAnalyst'], sentence['citeCompany'] = self.isCiteInDistance(sentence)
			self.db.saveSentence(sentence)

	def parseAllSentencePfm(self):
		"""Store performance words and nearby positive/negative words for every sentence."""
		# Materialise first: updating sentences while iterating a live cursor breaks the cursor.
		sentences = list(self.db.getAllSentence())
		for i, sentence in enumerate(sentences):
			print(i)
			pfmSentenceWordList = getProcessedWordList(sentence['content'], NOUN)
			pfmWordList = [word for word in pfmSentenceWordList if word in self.pfmWord]
			posNegSentenceWordList = getProcessedWordList(sentence['content'], VERB)
			posWordList = [word for word in posNegSentenceWordList if word in self.posWord]
			negWordList = [word for word in posNegSentenceWordList if word in self.negWord]

			posWordList, negWordList = self.filterPosNegWordListByDistance(pfmSentenceWordList, posNegSentenceWordList, pfmWordList, posWordList, negWordList)

			self.db.updateSentencePfm(sentence['_id'], pfmWordList, posWordList, negWordList)

	def parseAllSentenceAtrb(self):
		"""Store external/internal attribution words for every sentence."""
		sentences = list(self.db.getAllSentence())
		for i, sentence in enumerate(sentences):
			print(i)
			words = getProcessedWordList(sentence['content'], NOUN)
			exWordList = [word for word in words if word in self.exWord]
			inWordList = [word for word in words if word in self.inWord]
			# In a citing sentence, 'ceo'/'executive' discards all internal words.
			if ('ceo' in inWordList or 'executive' in inWordList) and sentence['cite']:
				inWordList = []
			self.db.updateSentenceAtrb(sentence['_id'], exWordList, inWordList)

	def isCiteInDistance(self, sentence):
		"""Report whether a citation word is near an engager or company name.

		A name counts as near when it lies within CITE_DISTANCE word
		positions of a citation word.

		Returns:
			A tuple ``(isCiteCEO, isCiteAnalyst, isCiteCompany)``.
		"""
		isCiteCEO, isCiteAnalyst, isCiteCompany = False, False, False
		if sentence['cite']:
			wordList = getProcessedWordList(sentence['content'], VERB)
			for citeWord in sentence['cite']:
				citeIndex = wordList.index(citeWord)
				for engagerId in sentence['engager']:
					try:
						engager = self.db.getEngagerById(engagerId)
						matchName = engager['lastName'].lower()
						engagerIndex = wordList.index(matchName)
						if abs(citeIndex - engagerIndex) <= CITE_DISTANCE:
							if engager['type'] == ENGAGER_CEO:
								isCiteCEO = True
							else:
								isCiteAnalyst = True
					except Exception:
						# Best effort: a name missing from the processed word
						# list or a failed lookup leaves the flags unchanged.
						pass

				for companyId in sentence['company']:
					try:
						company = self.db.getCompanyById(companyId)
						matchName = company['shortName'].lower()
						companyIndex = wordList.index(matchName)
						if abs(citeIndex - companyIndex) <= CITE_DISTANCE:
							isCiteCompany = True
					except Exception:
						# Best effort, as for the engagers above.
						pass
		return isCiteCEO, isCiteAnalyst, isCiteCompany

	def filterPosNegWordListByDistance(self, pfmSentenceWordList, posNegSentenceWordList, pfmWordList, posWordList, negWordList):
		"""Keep the positive/negative words close to a performance word.

		For each performance word, every positive and negative word whose
		first position is within PFM_DISTANCE of the performance word's
		first position is kept, so a word can appear once per nearby
		performance word.

		Returns:
			A pair ``(filteredPosWordList, filteredNegWordList)``.
		"""
		filteredPosWordList, filteredNegWordList = [],[]
		for pfmWord in pfmWordList:
			pfmIndex = pfmSentenceWordList.index(pfmWord)
			for posWord in posWordList:
				posIndex = posNegSentenceWordList.index(posWord)
				if abs(pfmIndex - posIndex) <= PFM_DISTANCE:
					filteredPosWordList.append(posWord)
			for negWord in negWordList:
				negIndex = posNegSentenceWordList.index(negWord)
				if abs(pfmIndex - negIndex) <= PFM_DISTANCE:
					filteredNegWordList.append(negWord)
		return filteredPosWordList, filteredNegWordList

コード例 #55

0

ファイルを表示

ファイル: Main.py プロジェクト: hhldiniz/pooptbank

 def __init__(self, title="Poopt Bank", size="500x400"):
     """Initialise the view with its title and size and register the button callback."""
     View.__init__(self, title, size)
     View.set_btn_callback(self, self.__btn_callback)
     # NOTE(review): this controller is bound to a local and never used in this
     # method; confirm whether constructing it is needed for a side effect.
     db_controller = DBController("localhost", "pooptbank")

コード例 #56

0

ファイルを表示

class test_DBController(unittest.TestCase):
    # def setUp(self):
    #     self.collection = mongomock.MongoClient().db.collection
    #     m1 = Member("name_test", "id_111", "email", "password")
    #     post1 = {"_id": "123", "name": "graves", "date": "apr.9"}
    #     self.collection.insert_one(post1)

    def setUp(self):
        """Seed mongomock collections and wrap them in the DBController under test.

        One document each is inserted into the "Administrator" (id "393"),
        "Member" (id "123") and "Activity" (id "888") collections.
        """
        self.database = mongomock.MongoClient().db
        post1 = {
            "_id": "393",
            "name": "software",
            "email_address": "*****@*****.**",
            "password": "******"
        }

        self.collection_admin = self.database.create_collection(
            "Administrator")
        self.collection_admin.insert_one(post1)

        post2 = {
            "_id": "123",
            "name": "Terry",
            "email_address": "*****@*****.**",
            "password": "******"
        }
        self.collection_member = self.database.create_collection("Member")
        self.collection_member.insert_one(post2)

        post3 = {
            "_id": "888",
            "name": "meeting",
            "start_time": "Apr10",
            "end_time": "Apr11",
            "location": "case"
        }

        self.collection_activity = self.database.create_collection("Activity")
        self.collection_activity.insert_one(post3)

        # Argument order: member, admin, activity collection.
        self.db = DBController(self.collection_member, self.collection_admin,
                               self.collection_activity)

    def test_member_is_present(self):
        """member_is_present is truthy for the seeded id and falsy for an unknown id."""
        self.assertTrue(self.db.member_is_present("123"))
        self.assertFalse(self.db.member_is_present("000"))

    def test_add_member(self):
        """add_member succeeds once per id and fails for ids that already exist."""
        member1 = Member("Marcus", "847", "*****@*****.**", "password")
        # member1 is not in the collection yet
        self.assertTrue(self.db.add_member(member1))
        # member1 now exists, so a second insert must fail
        self.assertFalse(self.db.add_member(member1))
        # member2 reuses the id "123" seeded in setUp
        member2 = Member("Terry", "123", "*****@*****.**", "pass")
        self.assertFalse(self.db.add_member(member2))

    def test_update_member(self):
        """update_member fails for an unknown id and succeeds for the seeded member."""
        member1 = Member("Marcus", "847", "*****@*****.**", "password")
        self.assertFalse(self.db.update_member(member1))
        member2 = Member("Terry", "123", "*****@*****.**", "change_password")
        self.assertTrue(self.db.update_member(member2))

    def test_retrieve_member(self):
        """retrieve_member returns the stored document, or None for an unknown id."""
        self.assertEqual(self.collection_member.find_one({"_id": "123"}),
                         self.db.retrieve_member("123"))
        self.assertIsNone(self.db.retrieve_member("000"))

    def test_retrieve_member_name(self):
        """retrieve_member_name returns the name of a seeded and of a newly added member."""
        member1 = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_member(member1)
        self.assertEqual(self.db.retrieve_member_name("123"), "Terry")
        self.assertEqual(self.db.retrieve_member_name("847"), "Marcus")

    def test_delete_member(self):
        """delete_member succeeds once and fails when the member is already gone."""
        self.assertTrue(self.db.delete_member("123"))
        self.assertFalse(self.db.delete_member("123"))

    def test_member_login(self):
        """member_login accepts the right credentials and rejects a wrong id or password."""
        # NOTE(review): the seeded password is shown masked in setUp; confirm it is "pass".
        self.assertTrue(self.db.member_login("123", "pass"))
        self.assertFalse(self.db.member_login("00", "pass"))
        self.assertFalse(self.db.member_login("123", "wrong_password"))

    def test_clubs_member_added(self):
        """clubs_member_added is not None for an added member and None for an unknown id."""
        member1 = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_member(member1)
        # The member seeded directly in setUp is not checked:
        # self.assertIsNotNone(self.db.clubs_member_added("123"))
        self.assertIsNotNone(self.db.clubs_member_added("847"))
        self.assertIsNone(self.db.clubs_member_added("000"))

    def test_add_club_to_member(self):
        """add_club_to_member succeeds once and fails for a repeat or an unknown club id."""
        member1 = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_member(member1)
        self.assertTrue(self.db.add_club_to_member("393", "847"))
        self.assertFalse(self.db.add_club_to_member("393", "847"))
        self.assertFalse(self.db.add_club_to_member("000", "847"))

    def test_remove_club_from_member(self):
        """remove_club_from_member succeeds for an added club and fails for an unknown one."""
        member1 = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_member(member1)
        self.db.add_club_to_member("393", "847")
        self.assertTrue(self.db.remove_club_from_member("393", "847"))
        self.assertFalse(self.db.remove_club_from_member("000", "847"))

    def test_request_permission(self):
        """request_permission succeeds once, then fails for an unknown admin and for a repeat."""
        admin1 = Administrator("terry", "110", "*****@*****.**", "pass")
        member1 = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_admin(admin1)
        self.db.add_member(member1)
        self.assertTrue(
            self.db.request_permission("110", "847", "*****@*****.**",
                                       "Marcus"))
        # Unknown admin id.
        self.assertFalse(
            self.db.request_permission("000", "847", "*****@*****.**",
                                       "Marcus"))
        # Same request as the first one, issued a second time.
        self.assertFalse(
            self.db.request_permission("110", "847", "*****@*****.**",
                                       "Marcus"))

    def test_admin_is_present(self):
        """admin_is_present is truthy for the seeded id and falsy for an unknown id."""
        self.assertTrue(self.db.admin_is_present("393"))
        self.assertFalse(self.db.admin_is_present("000"))

    def test_update_member_face_id(self):
        """update_member_face_id succeeds for an existing member and fails for an unknown id."""
        member1 = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_member(member1)
        self.assertTrue(self.db.update_member_face_id("847", "face_id"))
        self.assertFalse(self.db.update_member_face_id("000", "face_id"))

    def test_retrieve_member_face_id(self):
        """retrieve_member_face_id returns the value stored by update_member_face_id."""
        member1 = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_member(member1)
        self.db.update_member_face_id("847", "face_id")
        self.assertEqual(self.db.retrieve_member_face_id("847"), "face_id")

    def test_add_admin(self):
        """add_admin succeeds for a new id and fails when the same admin is added again."""
        admin1 = Administrator("new_admin", "000", "*****@*****.**",
                               "new_password")
        self.assertTrue(self.db.add_admin(admin1))
        self.assertFalse(self.db.add_admin(admin1))

    def test_update_admin(self):
        """update_admin succeeds for the seeded admin and fails for an unknown id."""
        admin1 = Administrator("software", "393", "*****@*****.**",
                               "new_password")
        self.assertTrue(self.db.update_admin(admin1))
        admin2 = Administrator("new_admin", "000", "*****@*****.**",
                               "new_password")
        self.assertFalse((self.db.update_admin(admin2)))

    def test_retrieve_admin(self):
        """retrieve_admin returns the stored document, or None for an unknown id."""
        self.assertEqual(self.db.retrieve_admin("393"),
                         self.collection_admin.find_one({"_id": "393"}))
        self.assertIsNone(self.db.retrieve_admin("000"))

    def test_delete_admin(self):
        """delete_admin succeeds for the seeded admin and fails for an unknown id."""
        self.assertTrue(self.db.delete_admin("393"))
        self.assertFalse(self.db.delete_admin("000"))

    def test_admin_login(self):
        """admin_login accepts the right credentials and rejects a wrong password or id."""
        # NOTE(review): the seeded password is shown masked in setUp; confirm it is "pass".
        self.assertTrue(self.db.admin_login("393", "pass"))
        self.assertFalse(self.db.admin_login("393", "wrong_pass"))
        self.assertFalse(self.db.admin_login("000", "pass"))

    def test_add_member_to_added_members(self):
        """add_member_to_added_members succeeds once and fails for a repeat or an unknown admin."""
        admin1 = Administrator("terry", "110", "*****@*****.**", "pass")
        member1 = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_admin(admin1)
        self.db.add_member(member1)
        self.assertTrue(self.db.add_member_to_added_members("110", "847"))
        self.assertFalse(self.db.add_member_to_added_members("110", "847"))
        self.assertFalse(self.db.add_member_to_added_members("000", "847"))

    def test_remove_member_from_added_members(self):
        """remove_member_from_added_members fails for unknown ids and succeeds for the added pair."""
        admin1 = Administrator("terry", "110", "*****@*****.**", "pass")
        member1 = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_admin(admin1)
        self.db.add_member(member1)
        self.db.add_member_to_added_members("110", "847")
        # The failing cases run first so the added pair is still in place for the last check.
        self.assertFalse(self.db.remove_member_from_added_members(
            "000", "847"))
        self.assertFalse(self.db.remove_member_from_added_members(
            "110", "000"))
        self.assertTrue(self.db.remove_member_from_added_members("110", "847"))

    def test_add_member_to_pending_members(self):
        """add_member_to_pending_members succeeds for a known admin and fails for an unknown one."""
        admin1 = Administrator("terry", "110", "*****@*****.**", "pass")
        member1 = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_admin(admin1)
        self.db.add_member(member1)
        self.assertTrue(self.db.add_member_to_pending_members("110", "847"))
        self.assertFalse(self.db.add_member_to_pending_members("000", "847"))

    def test_remove_member_from_pending_members(self):
        """remove_member_from_pending_members fails for unknown ids and succeeds for the pending pair."""
        admin1 = Administrator("terry", "110", "*****@*****.**", "pass")
        member1 = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_admin(admin1)
        self.db.add_member(member1)
        self.db.add_member_to_pending_members("110", "847")
        # The failing cases run first so the pending pair is still in place for the last check.
        self.assertFalse(
            self.db.remove_member_from_pending_members("000", "847"))
        self.assertFalse(
            self.db.remove_member_from_pending_members("110", "000"))
        self.assertTrue(
            self.db.remove_member_from_pending_members("110", "847"))

    def test_added_members(self):
        """Expect added_members("110") to be a non-None empty list after add_admin."""
        admin = Administrator("terry", "110", "*****@*****.**", "pass")
        self.db.add_admin(admin)

        first_lookup = self.db.added_members("110")
        self.assertIsNotNone(first_lookup)
        second_lookup = self.db.added_members("110")
        self.assertEqual([], second_lookup)

    def test_pending_members(self):
        """Expect pending_members("110") to be a non-None empty list after add_admin."""
        admin = Administrator("terry", "110", "*****@*****.**", "pass")
        self.db.add_admin(admin)

        first_lookup = self.db.pending_members("110")
        self.assertIsNotNone(first_lookup)
        second_lookup = self.db.pending_members("110")
        self.assertEqual([], second_lookup)

    def test_permit(self):
        """Expect permit(...) to be truthy once member "847" is pending on admin "110"."""
        admin = Administrator("terry", "110", "*****@*****.**", "pass")
        member = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_admin(admin)
        self.db.add_member(member)
        self.db.add_member_to_pending_members("110", "847")

        permitted = self.db.permit("*****@*****.**", "110", "terry")
        self.assertTrue(permitted)

    def test_reject(self):
        """Expect reject(...) to be truthy once member "847" is pending on admin "110"."""
        admin = Administrator("terry", "110", "*****@*****.**", "pass")
        member = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_admin(admin)
        self.db.add_member(member)
        self.db.add_member_to_pending_members("110", "847")

        rejected = self.db.reject("*****@*****.**", "110", "terry")
        self.assertTrue(rejected)

    def test_activity_is_present(self):
        """Expect activity_is_present to be truthy for "888" and falsy for "000"."""
        lookup = self.db.activity_is_present
        self.assertTrue(lookup("888"))
        self.assertFalse(lookup("000"))

    def test_add_activity(self):
        """Expect add_activity to be truthy the first time and falsy on a repeat."""
        starts_at = datetime(2020, 2, 2, 3, 20)
        ends_at = datetime(2020, 2, 2, 4, 30)
        activity = Activity("999", "party", starts_at, ends_at, "case")

        first_attempt = self.db.add_activity(activity, "393")
        self.assertTrue(first_attempt)
        second_attempt = self.db.add_activity(activity, "393")
        self.assertFalse(second_attempt)

    def test_update_activity(self):
        """Expect update_activity to be truthy for id "888" and falsy for id "000"."""
        starts_at = datetime(2020, 2, 2, 3, 20)
        ends_at = datetime(2020, 2, 2, 4, 30)
        known = Activity("888", "party", starts_at, ends_at, "case")
        unknown = Activity("000", "party", starts_at, ends_at, "case")

        self.assertTrue(self.db.update_activity(known))
        self.assertFalse(self.db.update_activity(unknown))

    def test_retrieve_activity(self):
        """Compare retrieve_activity("888") with the raw document; expect None for "000"."""
        expected = self.collection_activity.find_one({"_id": "888"})
        actual = self.db.retrieve_activity("888")
        self.assertEqual(expected, actual)

        missing = self.db.retrieve_activity("000")
        self.assertIsNone(missing)

    def test_delete_activity(self):
        """Expect delete_activity to be truthy for "888" and falsy for "000"."""
        delete = self.db.delete_activity
        self.assertTrue(delete("888"))
        self.assertFalse(delete("000"))

    def test_add_activity_to_member(self):
        """Expect add_activity_to_member to be truthy once and falsy on a repeat."""
        member = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_member(member)

        first_attempt = self.db.add_activity_to_member(
            "393", "888", "847", "on_time")
        self.assertTrue(first_attempt)
        second_attempt = self.db.add_activity_to_member(
            "393", "888", "847", "on_time")
        self.assertFalse(second_attempt)

    def test_set_member_activity_status(self):
        """Expect a truthy result for activity "888" and a falsy one for "000"."""
        member = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_member(member)
        self.db.add_activity_to_member("393", "888", "847", "on_time")

        assigned = self.db.set_member_activity_status(
            "393", "888", "847", "on_time")
        self.assertTrue(assigned)
        unassigned = self.db.set_member_activity_status(
            "393", "000", "847", "on_time")
        self.assertFalse(unassigned)

    def test_remove_activity_from_member(self):
        """Expect remove_activity_from_member to be truthy once and falsy on a repeat."""
        member = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_member(member)
        self.db.add_activity_to_member("393", "888", "847", "on_time")

        first_attempt = self.db.remove_activity_from_member(
            "393", "888", "847")
        self.assertTrue(first_attempt)
        second_attempt = self.db.remove_activity_from_member(
            "393", "888", "847")
        self.assertFalse(second_attempt)

    def test_add_activity_to_admin(self):
        """Expect add_activity_to_admin to be truthy for admin "110" and falsy for "000"."""
        admin = Administrator("terry", "110", "*****@*****.**", "pass")
        self.db.add_admin(admin)

        attach = self.db.add_activity_to_admin
        self.assertTrue(attach("888", "110"))
        self.assertFalse(attach("888", "000"))

    def test_remove_activity_from_admin(self):
        """Expect remove_activity_from_admin to be truthy once and falsy on a repeat."""
        admin = Administrator("terry", "110", "*****@*****.**", "pass")
        self.db.add_admin(admin)
        self.db.add_activity_to_admin("888", "110")

        first_attempt = self.db.remove_activity_from_admin("888", "110")
        self.assertTrue(first_attempt)
        second_attempt = self.db.remove_activity_from_admin("888", "110")
        self.assertFalse(second_attempt)

    def test_member_status_in_activity(self):
        """Expect member_status_in_activity to report "on_time" after it was set."""
        member = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_member(member)
        self.db.add_activity_to_member("393", "888", "847", "on_time")
        self.db.set_member_activity_status("393", "888", "847", "on_time")

        status = self.db.member_status_in_activity("847", "393", "888")
        self.assertEqual(status, "on_time")

    def test_member_activities(self):
        """Expect [] for the newly added member "847" and None for id "000"."""
        member = Member("Marcus", "847", "*****@*****.**", "password")
        self.db.add_member(member)

        for_known = self.db.member_activities("847")
        self.assertEqual(for_known, [])
        for_unknown = self.db.member_activities("000")
        self.assertIsNone(for_unknown)

    def test_admin_activities(self):
        """Expect [] for the newly added admin "000" and None for id "111"."""
        admin = Administrator(
            "new_admin", "000", "*****@*****.**", "new_password")
        self.db.add_admin(admin)

        for_known = self.db.admin_activities("000")
        self.assertEqual(for_known, [])
        for_unknown = self.db.admin_activities("111")
        self.assertIsNone(for_unknown)

    def test_activity_start_time(self):
        """Expect "Apr10" as the start time of "888" and None for "000"."""
        known_start = self.db.activity_start_time("888")
        self.assertEqual(known_start, "Apr10")
        unknown_start = self.db.activity_start_time("000")
        self.assertIsNone(unknown_start)

    def test_activity_end_time(self):
        """Expect "Apr11" as the end time of "888" and None for "000"."""
        known_end = self.db.activity_end_time("888")
        self.assertEqual(known_end, "Apr11")
        unknown_end = self.db.activity_end_time("000")
        self.assertIsNone(unknown_end)

コード例 #57

0

ファイルを表示

ファイル: Main.py プロジェクト: michaelchum/hackuoft

def main():
    """Create a DBController, pass it to scrapeRFD, and hand the result to mailAll."""
    controller = DBController()
    recipients = scrapeRFD(controller)
    mailAll(recipients)

コード例 #58

0

ファイルを表示

ファイル: apkpure_crawler.py プロジェクト: munhyunsu/ApplicationPerformance

class Crawler:
    """Crawl Play Store chart metadata and download the APKs from apkpure.com.

    Settings are read from ``config.ini`` in the working directory and app
    metadata is stored through a ``DBController``.
    """

    def __init__(self, is_desktop):
        """
        Constructor.
        is_desktop : True when running in a desktop (GUI) environment,
                     False when running in a server environment
        """
        # Read the settings from config.ini
        config = configparser.ConfigParser()
        config.read('config.ini')
        self.apk_directory = config.get('Setting', 'APK_DIRECTORY')
        os.makedirs(self.apk_directory, exist_ok=True)
        self.is_desktop = is_desktop

        # In server mode, start a virtual display and run Chrome headless
        chrome_options = webdriver.ChromeOptions()
        if not is_desktop:
            self.display = Display(visible=0, size=(1024, 768))
            self.display.start()
            chrome_options.add_argument('--headless')

        # Start the Chrome driver
        self.chrome = webdriver.Chrome(config.get('Setting',
                                                  'CHROME_DRIVER_DIRECTORY'),
                                       chrome_options=chrome_options)
        self.chrome.set_window_size(1024, 768)

        # (category name, chart URL) pairs to crawl
        self.category_list = config.items('PlayStoreURL')

        # DBController that stores and manages the collected data
        self.db_connector = DBController(config.get('Setting', 'DB_DIRECTORY'))

        # Create the SQLite table that holds the metadata
        self.db_connector.create_table()

    def __get_new_app_list(self, popular_url):
        """
        (private)
        Collect the package names listed on a popular-chart page
        (intended to cover the top 300 apps).
        popular_url : URL of the popular chart of one category
        Returns the list of package names found on the page.
        """

        # Navigate to the chart and wait for it to load
        self.chrome.get(popular_url)
        self.chrome.implicitly_wait(10)

        # The page must be scrolled before all ranked apps are rendered
        for scroll in (10000, 20000, 30000, 40000, 50000):
            self.chrome.execute_script("window.scrollTo(0," \
                + str(scroll) + ");")
            time.sleep(2)

        package_list = []
        # Select the app card divs
        div_app_list = self.chrome.find_elements_by_css_selector(
            ".card.no-rationale.square-cover.apps.small")

        # Extract the package name from each card's title link
        for div_app in div_app_list:
            app_detail = div_app.find_element_by_class_name('details')
            url = app_detail.find_element_by_class_name('title')\
                .get_attribute('href')
            package_name = url.split('id=')[1]
            package_list.append(package_name)

        return package_list

    def __get_app_detail(self, package_list):
        """
        (private)
        Visit the Play Store detail page of every package in package_list and
        scrape the app name, image source, update date and rating.
        Returns a list of
        [name, package, img_src, updated_date, ratings, False] rows, where the
        trailing False is the "APK downloaded" flag.
        """

        # Base URL of the detail page; the package name is appended to it
        base_url = 'https://play.google.com/store/apps/details?id='
        detail_list = []

        for package in package_list:
            app_url = base_url + package

            # Navigate to the detail page and wait for it to load
            self.chrome.get(app_url)
            self.chrome.implicitly_wait(10)

            # Each lookup falls back to a default when it fails.
            # `except Exception` (not a bare except) so that Ctrl-C and
            # SystemExit still stop the crawl.
            try:
                name = self.chrome.find_element_by_css_selector(
                    'h1[itemprop="name"]').text.strip()
            except Exception:
                name = package
            try:
                img_src = self.chrome.find_element_by_css_selector(
                    'img[alt="Cover art"]').get_attribute('src')
            except Exception:
                img_src = 'https://upload.wikimedia.org/wikipedia/en/4/48/Blank.JPG'
            try:
                updated_date = self.chrome.find_element_by_css_selector(
                    'span[class="htlgb"]').text.strip()
            except Exception:
                updated_date = 'January 1, 2000'
            try:
                ratings = self.chrome.find_element_by_css_selector(
                    'meta[itemprop="ratingValue"]').get_attribute('content')
            except Exception:
                ratings = -1

            # [app name, package name, image source, last update, rating,
            #  APK downloaded?]
            print('FromPlayStore', name, package, img_src, updated_date,
                  ratings)
            detail_list.append(
                [name, package, img_src, updated_date, ratings, False])
            time.sleep(2)

        return detail_list

    def __download_apk(self, package_name, download_url):
        """
        (private)
        Download an APK file with an HTTP request.
        Returns False when the request times out, answers with an HTTP error
        status, or any other error occurs; returns True once the file has
        been written.
        package_name : name of the package to download
        download_url : URL the HTTP request is sent to
        """
        file_name = str(package_name) + '.apk'

        # 60-second timeout so unresponsive downloads are treated as failures
        try:
            r = requests.get(download_url, timeout=60)
            # Do not store an HTTP error page as if it were an APK
            r.raise_for_status()
            # Saved as <apk directory>/<package name>.apk; os.path.join also
            # works when the configured directory has no trailing separator
            with open(os.path.join(self.apk_directory, file_name),
                      'wb') as apk:
                apk.write(r.content)
        except requests.exceptions.Timeout:
            print('time out')
            return False
        except Exception as e:
            print(e)
            return False
        return True

    def crawl_new(self):
        """
        (public)
        Crawl the popular chart of every configured category and store the
        results in the DB.
        """
        # TODO: list randomization needed
        for category_name, url in self.category_list:
            # Package names listed on this category's popular chart
            new_package_list = self.__get_new_app_list(url)

            # Detail rows (name, update date, image source, rating)
            updated_app_list = self.__get_app_detail(new_package_list)

            # Hand the rows to the DB layer. Per the original notes it keeps
            # unchanged apps, refreshes updated ones and inserts new ones.
            self.db_connector.update_app(updated_app_list, category_name)

        self.db_connector.commit_n_close()

    def crawl_old(self):
        """
        (public)
        Refresh the metadata of the apps that are already stored in the DB.
        """
        for category in self.category_list:
            category_name = category[0]

            # Packages already stored for this category.
            # NOTE(review): the whole (name, url) tuple is passed here while
            # update_app below receives category_name - confirm which one
            # get_old_category_app_list expects.
            old_package_list = self.db_connector.get_old_category_app_list(
                category)

            # Re-crawl the Play Store details of those packages
            updated_app_list = self.__get_app_detail(old_package_list)

            # Store the refreshed rows
            self.db_connector.update_app(updated_app_list, category_name)
        self.db_connector.commit_n_close()

    def update_apk(self):
        """
        Download the APK of every package the DB reports as not downloaded
        yet and record the outcome of each download in the DB.
        """

        # Rows of packages whose APK has not been downloaded yet
        not_updated_list = self.db_connector.not_updated_list()

        for package_row in not_updated_list:
            package_name = package_row[0]
            # Search apkpure.com for the package name
            search_url = 'http://apkpure.com/search?q=' + package_name
            self.chrome.get(search_url)
            self.chrome.implicitly_wait(10)

            # Check whether any app was found
            search_titles = self.chrome.\
                find_elements_by_class_name('search-title')

            # No search result means there is nothing to download
            if len(search_titles) == 0:
                print(package_name + " is not searched")
                continue

            # Pick the result whose href contains the package name. `link`
            # is only assigned on a match, so it stays '' when none of the
            # results belongs to this package.
            link = ''
            for title in search_titles:
                href = title.find_element_by_tag_name('a')\
                    .get_attribute('href')
                if href and package_name in href:
                    link = href
                    break

            # Results were returned but none matches the package name
            if link == '':
                print(package_name + ' is not searched in APKpure')
                continue

            # Open the app page that carries the download link
            self.chrome.get(link)
            self.chrome.implicitly_wait(10)

            a_list = self.chrome.find_elements_by_class_name(' down')
            try:
                for a in a_list:
                    link = a.get_attribute('href')
                    # An href containing the package name is the one we want
                    if package_name in link:
                        self.chrome.get(link)
                        self.chrome.implicitly_wait(10)
                        break
                # The download iframe is sometimes missing; in that case the
                # APK is skipped and the next one is processed
                iframe = self.chrome.find_element_by_id('iframe_download')

                src = iframe.get_attribute('src')
            except Exception:
                print(package_name + " does not have href or iframe")
                continue

            # Record True in the DB when the download succeeded, else False
            if self.__download_apk(package_name, src):
                self.db_connector.update_is_downloaded(package_name, True)
                print(package_name, 'downloaded')
            else:
                self.db_connector.update_is_downloaded(package_name, False)
                print(package_name, 'no-downloaded')
            time.sleep(2)

        self.db_connector.commit_n_close()

    def close(self):
        """Close the browser window and, in server mode, stop the virtual display."""
        # NOTE(review): close() only closes the current window; confirm
        # whether quit() is wanted here to also end the driver session.
        self.chrome.close()
        if not self.is_desktop:
            self.display.stop()

コード例 #59

0

ファイルを表示

from DBController import DBController
from pymongo import MongoClient
from Member import Member
from Administrator import Administrator
from Activity import Activity

# NOTE(review): the connection string embeds credentials in source; consider
# loading it from configuration or the environment instead.
cluster = MongoClient(
    "mongodb+srv://wz:1999314Zwh%[email protected]/test?retryWrites=true&w"
    "=majority")
db = cluster["AMS"]

# One collection handle per entity type, looked up by collection name.
collection_member, collection_admin, collection_activity = [
    db[collection_name]
    for collection_name in ("Member", "Administrator", "Activity")
]

# Controller under test, wired to the three collections above.
db_controller = DBController(
    collection_member,
    collection_admin,
    collection_activity,
)

# for i in range (0, 100):
#      member = Member(str(i), str(i), "*****@*****.**", "member")
#      db_controller.add_member(member)

# for i in range (0 ,100):
#      member = Member("updated name", str(i), "updated email", "updated password")
#      db_controller.update_member(member)

# for i in range (0 ,100):
#     db_controller.retrieve_member(str(i))

# for i in range (0 ,100):
#      db_controller.delete_member(str(i))

コード例 #60

0

ファイルを表示

ファイル: DataExtractor.py プロジェクト: exsonic/FatSecret_Crawler

class DataExtractor(object):
    def __init__(self):
        """Create the DBController and the browser object returned by login()."""
        self.db = DBController()
        # login() opens the sign-in page, submits the form and returns the
        # browser, which the scraping methods reuse through self.br.
        self.br = self.login()
    
    def login(self):
        """Open the FatSecret sign-in page, submit the login form, return the browser.

        The browser gets a fresh LWP cookie jar, ignores robots.txt and
        follows equiv/redirect/referer/refresh handling as configured below.
        """
        browser = Browser()
        browser.set_cookiejar(cookielib.LWPCookieJar())

        browser.set_handle_equiv(True)
        browser.set_handle_redirect(True)
        browser.set_handle_referer(True)
        browser.set_handle_robots(False)
        browser.set_handle_refresh(_http.HTTPRefreshProcessor(), max_time=2)

        browser.open('http://www.fatsecret.com/Auth.aspx?pa=s')
        browser.select_form(nr=0)
        # Keys are the name attributes of the login form fields.
        # PLEASE put your own username and password here.
        formValues = (
            ('_ctl0:_ctl7:Logincontrol1:Name', 'username'),
            ('_ctl0:_ctl7:Logincontrol1:Password', '******'),
        )
        for fieldName, fieldValue in formValues:
            browser[fieldName] = fieldValue
        browser.submit()
        return browser
    
    #========================================================================================
    # URLType: 0 memberURL, 1 weightHistory, 2 dietHistory, 3 groups, 4 challenges, 5 buddies
    #========================================================================================
    def getURL(self, user, URLType):
        if URLType == 0:
            return 'http://fatsecret.com/member/' + '+'.join(user['name'].encode('utf-8', 'ignore').split())
        if user['serverId'] is None:
            return None
        elif URLType == 1:
            return 'http://www.fatsecret.com/Default.aspx?pa=memh&id=' + user['serverId']
        elif URLType == 2:
            return 'http://www.fatsecret.com/Diary.aspx?pa=mdcs&id=' + user['serverId']
        elif URLType == 3:
            return 'http://www.fatsecret.com/Default.aspx?pa=memgrps&id=' + user['serverId']
        elif URLType == 4:
            return 'http://www.fatsecret.com/Default.aspx?pa=memchals&id=' + user['serverId']
        elif URLType == 5:
            return 'http://www.fatsecret.com/Default.aspx?pa=memb&id=' + user['serverId']
        else:
            raise Exception('invalid URL type')
        
    def convertUserIdToUserList(self, userId):
        if userId is None or userId == []:
            return self.db.getAllUserList()
        elif isinstance(userId, list) and userId != []:
            userList = []
            for v in userId:
                user = self.db.getUserById(v)
                if user is not None:
                    userList.append(user)
            return userList
        elif isinstance(userId, int):
            user = self.db.getUserById(userId)
            return [user] if user is not None else []
        else:
            raise Exception('invalid input userId')
            
    def getServerId(self, userId=None):
        users = self.convertUserIdToUserList(userId)
        for user in users:
            if 'serverId' in user and user['serverId'] is not None:
                continue
            serverId = None
            try:
                memberURL = self.getURL(user, 0)
                page = self.br.open(memberURL)
                soup = BeautifulSoup(page.read())
                result = soup.find('div', attrs={'align' : 'right', 'class' : 'smallText', 'style' : 'padding-top:5px'})
                if result is not None:
                    for tag in result.contents:
                        if isinstance(tag, element.Tag) and 'href' in tag.attrs and tag.attrs['href'].find('id') != -1:
                            serverId = tag.attrs['href'].split('id=')[1]
                            break     
            except Exception as e:
                logException(user['id'], self.getServerId.__name__, e)
            finally:      
                self.db.updateServerId(user['id'], serverId)
    
    def getWeightHistory(self, userId=None):
        users = self.convertUserIdToUserList(userId)
        for user in users:
            diet, startWeight, goalWeight, weightHistory = None, None, None, None
            try:
                if user['serverId'] is not None:
                    weightHistoryURL = self.getURL(user, 1)
                    page = self.br.open(weightHistoryURL)
                    soup = BeautifulSoup(page.read())
                    tag = soup.find('b')
                    diet = tag.contents[1].text
                    tag = soup.find(attrs={'style' : 'padding:0px 10px'})
                    startWeight = float(tag.contents[1].split(': ')[1].split()[0])
                    goalWeight = float(tag.contents[0].text.split(': ')[1].split()[0])
                    weightList, dateList = [], []
                    for tag in soup.findAll(attrs={'class' : 'borderBottom date'}):
                        dateList.append(parser.parse(tag.text))
                    for tag in soup.findAll(attrs={'class' : 'borderBottom weight'}):
                        weightList.append(float(tag.text.split()[0]))
                    weightHistory = zip(dateList, weightList)
                    weightHistory = sorted(weightHistory, key= lambda record : record[0])
            except Exception as e:
                logException(user['id'], self.getWeightHistory.__name__, e)
            finally:
                self.db.updateWeightHistory(user['id'], diet, startWeight, goalWeight, weightHistory)
    
    def getDietHistory(self, userId=None):
        """Scrape the diet-history table of each selected user and store it.

        For users with a non-None serverId the diet page is parsed into
        (date, food, RDI, fat, protein, carbs, exercise, net) tuples, merged
        with any history already on the user record, and written back with
        updateDietHistory. The write happens in ``finally``, so it also runs
        (with None or a partial list) when scraping fails.
        """
        users = self.convertUserIdToUserList(userId)
        for user in users:
            dietHistory = None
            try:
                if user['serverId'] is not None:
                    dietHistoryURL = self.getURL(user, 2)
                    page = self.br.open(dietHistoryURL)
                    soup = BeautifulSoup(page.read())
                    # Month header cells, parsed as '<month name> <year>'.
                    months = soup.findAll('td', attrs={'colspan' : '6', 'class' : 'borderBottom'})
                    monthList = []
                    if months == []:
                        raise Exception('no diet history records')
                    for month in months:
                        monthList.append(datetime.strptime(month.text, '%B %Y'))
                    rows = soup.findAll('tr', attrs={'valign' : 'middle'})
                    # 32 is above any day of month, so the first row never
                    # advances monthIndex.
                    prevDay = 32
                    monthIndex = 0
                    dietHistory = []
                    for row in rows:
                        try:
                            # Only rows with exactly 13 child nodes are data rows.
                            if len(row.contents) != 13:
                                continue
                            day = int(re.sub('[^0-9]', '', row.contents[1].text))
                            # A day that does not decrease moves on to the next
                            # entry of monthList.
                            # NOTE(review): presumably rows are listed in
                            # descending day order within a month - confirm.
                            if day >= prevDay:
                                monthIndex += 1 
                            prevDay = day
                            date = datetime(monthList[monthIndex].year, monthList[monthIndex].month, day)
                            food = self.getIntFromRawString(row.contents[3].text)
                            RDI = self.getDecimalFromPercentageString(row.contents[5].text)
                            fat, protein, carbs = self.getDataFromNutrionalSummary(row.contents[7].text)
                            exercise = self.getIntFromRawString(row.contents[9].text)
                            net = self.getIntFromRawString(row.contents[11].text)
                            dietHistory.append((date, food, RDI, fat, protein, carbs, exercise, net))
                        except Exception as e:
                            # A bad row is logged and skipped; the remaining rows
                            # are still processed.
                            logException(user['id'], self.getDietHistory.__name__, e, 'scrape row error')
                    # Merge with previously stored history when there is one,
                    # otherwise just order the new records by date.
                    if 'dietHistory' in user and user['dietHistory'] is not None:
                        dietHistory = self.mergeDietTrack(user['dietHistory'], dietHistory)
                    else:
                        dietHistory.sort(key=lambda item : item[0])
            except Exception as e:
                logException(user['id'], self.getDietHistory.__name__, e)
            finally:
                self.db.updateDietHistory(user['id'], dietHistory)
    
    def getGroup(self, userId=None):
        users = self.convertUserIdToUserList(userId)
        for user in users:
            groupIdList = []
            try:
                if user['serverId'] is not None:
                    groupURL = self.getURL(user, 3)
                    page = self.br.open(groupURL)
                    soup = BeautifulSoup(page.read())
                    results = soup.findAll('td', attrs={'width' : '50', 'align' : 'center'})
                    for tag in results:
                        groupName =  tag.contents[1].attrs['title']
                        group = self.db.addNewGroup(groupName)
                        self.db.addUserInGroup(user['id'], group['id'])
                        groupIdList.append(group['id'])
            except Exception as e:
                logException(user['id'],self.getGroup. __name__, e)
            finally:
                self.db.addGroupInUser(user['id'], groupIdList)
    
    def getChallenge(self, userId=None):
        users = self.convertUserIdToUserList(userId)
        for user in users:
            challengeIdList = []
            try:
                if user['serverId'] is not None:
                    challengeURL = self.getURL(user, 4)
                    page = self.br.open(challengeURL)
                    soup = BeautifulSoup(page.read())
                    results = soup.findAll('td', attrs={'width' : '50', 'align' : 'center'})
                    for tag in results:
                        challengeName = tag.contents[1].attrs['title']
                        challenge = self.db.addNewChallenge(challengeName)
                        self.db.addUserInChallenge(user['id'], challenge['id'])
                        challengeIdList.append(challenge['id'])
            except Exception as e:
                logException(user['id'], self.getChallenge.__name__, e)
            finally:
                self.db.addChallengeInUser(user['id'], challengeIdList)
    
    def getBuddy(self, userId=None):
        users = self.convertUserIdToUserList(userId)
        for user in users:
            buddyIdList = []
            try:
                if user['serverId'] is not None:
                    buddyURL = self.getURL(user, 5)
                    while True:
                        page = self.br.open(buddyURL)
                        soup = BeautifulSoup(page.read())
                        results = soup.findAll('a', attrs={'class' : 'member', 'onmouseout' : 'hideTip()'})
                        for tag in results:
                            if tag.text != '':
                                buddyName = tag.text.strip()
                                buddy = self.db.addNewUser(buddyName)
                                buddyIdList.append(buddy['id'])
                                if 'serverId' not in buddy:
                                    self.getServerId(buddy['id'])
                        result = soup.find('span', attrs={'class' : 'next'})
                        if result is None:
                            break
                        else:
                            buddyURL = 'http://fatsecret.com/' + result.contents[0].attrs['href']
            except Exception as e:
                logException(user['id'], self.getBuddy.__name__, e)
            finally:
                self.db.addBuddyInUser(user['id'], buddyIdList)
    
    def mergeDietTrack(self, oldTrack, newTrack):
        oldTrack, newTrack = sorted(oldTrack, key= lambda item : item[0]), sorted(newTrack, key= lambda item: item[0])
        i = 0
        for item in oldTrack:
            if item[0] >= newTrack[0][0]:
                break
            i += 1
        return oldTrack[0 : i] + newTrack
    
    def cleanNonNumercial(self, dataString):
        return re.sub('[^0-9.]', '', dataString.strip())
    
    def getIntFromRawString(self, dataString):
        dataString = self.cleanNonNumercial(dataString)
        return int(dataString) if dataString != '' else None
    
    def getDataFromNutrionalSummary(self, dataString):
        if dataString.strip() == '':
            return None, None, None
        fat = float(dataString.split('fat: ')[1].split('g')[0])
        protein = float(dataString.split('protein: ')[1].split('g')[0])
        carbs = float(dataString.split('carbs: ')[1].split('g')[0])
        return fat, protein, carbs
    
    def getDecimalFromPercentageString(self, dataString):
        dataString = self.cleanNonNumercial(dataString)
        return float(self.cleanNonNumercial(dataString)) / 100 if dataString != '' else None