def __btn_callback(self, btn):
    """Handle a click on one of the login-window buttons.

    ``btn`` is the label of the pressed button:

    * "Entrar"     - look the typed credentials up in the local
                     ``pooptbank`` database and open the home view on a
                     match, otherwise the error view.
    * "Sair"       - stop the GUI.
    * "Cadastrar"  - open the sign-up window.
    * "Configurar" - open the configuration window.

    Any other label is ignored.
    """
    if btn == "Entrar":
        db_controller = DBController(host="localhost",
                                     port=27017,
                                     db_name="pooptbank",
                                     db_user="",
                                     db_pass="")
        typed_user = self.get_app_gui().getEntry("Username")
        typed_pass = self.get_app_gui().getEntry("Password")
        record = db_controller.select_data_single(
            'users', {'username': typed_user, 'password': typed_pass})

        # The returned record is re-checked against what was typed before
        # the user is let in.
        authenticated = (record is not None
                         and record['username'] == typed_user
                         and record['password'] == typed_pass)
        if not authenticated:
            error_view = ErrorView(View.get_app_gui(self), "Erro")
            error_view.show("Erro")
            return

        home_view = HomeView(View.get_app_gui(self), "Home",
                             record['username'], str(record['balance']))
        home_view.show("Home")
        return

    if btn == "Sair":
        View.get_app_gui(self).stop()
        return

    if btn == "Cadastrar":
        signup_view = None
        try:
            signup_view = Signup(View.get_app_gui(self), btn)
            signup_view.show(btn)
        except ItemLookupError:
            # NOTE(review): if Signup() itself raised, signup_view is still
            # None here and this call fails with AttributeError - confirm
            # which call is expected to raise ItemLookupError.
            signup_view.show(btn)
        return

    if btn == "Configurar":
        config_view = Configuration(View.get_app_gui(self), btn)
        config_view.show(btn)
def setUp(self):
    """Create an in-memory mongomock database with one document per collection.

    Three collections are created and seeded (Administrator, Member,
    Activity) and handed to a fresh ``DBController`` stored on ``self.db``.
    """
    self.database = mongomock.MongoClient().db

    def seeded(collection_name, document):
        # Create the collection, insert its single fixture document and
        # hand the collection back to the caller.
        collection = self.database.create_collection(collection_name)
        collection.insert_one(document)
        return collection

    self.collection_admin = seeded("Administrator", {
        "_id": "393",
        "name": "software",
        "email_address": "*****@*****.**",
        "password": "******",
    })
    self.collection_member = seeded("Member", {
        "_id": "123",
        "name": "Terry",
        "email_address": "*****@*****.**",
        "password": "******",
    })
    self.collection_activity = seeded("Activity", {
        "_id": "888",
        "name": "meeting",
        "start_time": "Apr10",
        "end_time": "Apr11",
        "location": "case",
    })

    self.db = DBController(self.collection_member,
                           self.collection_admin,
                           self.collection_activity)
def __seed_books(cls):
    """Seed the database with the physics books.

    If the serialized cache ``phy-books/out/articles_books.json`` does not
    exist yet it is built first from ``phy-books/phy_books.json`` through
    ``BooksFetcher``.  Every entry of the cache is then wrapped in a
    ``PhyBook`` and stored via ``DBController.add_document`` under a fresh
    UUID.  ``cls`` is not used.
    """
    out_path = 'phy-books/out'
    data_path = out_path + '/articles_books.json'

    # exist_ok replaces the race-prone "check, then create" sequence.
    os.makedirs(out_path, exist_ok=True)

    if not os.path.isfile(data_path):
        print('Resource books does not exist! Сreation is in progress...')
        with open('phy-books/phy_books.json', 'r', encoding='utf8') as fh:
            # Raw data collected from the MIF site.
            books = json.load(fh)

        phy_books = BooksFetcher(books).create_phy_book()
        books_list = [obj.serialize() for obj in phy_books]

        # Serialized PhyBooks objects; the file is only written here, so
        # plain 'w' is enough.
        with open(data_path, 'w', encoding='utf8') as file:
            json.dump(books_list, file, indent=2)
        print('Resource created')

    with open(data_path, 'r', encoding='utf8') as data_file:
        books = json.load(data_file)

    total = len(books)
    for index, book in enumerate(books, start=1):
        phy_book = PhyBook(book)
        print(f'add {index} of the {total} books: {phy_book.title}')
        # A constructor call never yields None, so the object is stored
        # unconditionally.
        DBController.add_document(phy_book, str(uuid.uuid4()))
def fillMissingValue(self, featureDict):
    """Replace every ``None`` value in ``featureDict`` and return the dict.

    For a missing key the mean of that key over all feature vectors the
    database returns for the same song (``featureDict['id']``) is used.
    When no vector has a value, a fixed fallback applies: the song's
    ``rank`` for radio/streaming/sales, or a per-key constant.  Keys
    without any fallback stay ``None``.  The dict is modified in place.
    """
    # Constant fallbacks used when the song has no recorded value at all.
    constantFallback = {
        'MVView': 7,
        'MVSocialInteraction': 4,
        'MTVReviewCount': 10,
        'MTVReviewScore': 0,
        'youtubeCommentCount': 100,
        'youtubeCommentScore': 3,
        'twitterCount': 100,
        'twitterScore': 0,
    }
    rankFallbackKeys = ('radio', 'streaming', 'sales')

    db = DBController()
    # The per-song query does not depend on the key being filled, so it is
    # fetched at most once (and only if something is actually missing).
    songFeatureList = None

    # Iterate over a snapshot: values are reassigned inside the loop.
    for k, v in list(featureDict.items()):
        if v is not None:
            continue

        if songFeatureList is None:
            songFeatureList = list(
                db.getAllFeatureListBySong(featureDict['id']))

        valueList = [featureVector[k]
                     for featureVector in songFeatureList
                     if featureVector[k] is not None]
        if valueList:
            featureDict[k] = sum(valueList) / float(len(valueList))
        elif k in rankFallbackKeys:
            featureDict[k] = featureDict['rank']
        elif k in constantFallback:
            featureDict[k] = constantFallback[k]

    return featureDict
def __init__(self, is_desktop):
    """
    Constructor.

    is_desktop : True when running in a desktop (GUI) environment,
                 False when running on a server.
    """
    # Read the settings from config.ini
    settings = configparser.ConfigParser()
    settings.read('config.ini')

    self.apk_directory = settings.get('Setting', 'APK_DIRECTORY')
    self.is_desktop = is_desktop

    # Server mode: start a virtual display before the browser
    if not is_desktop:
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()

    # Start the Chrome driver
    driver_location = settings.get('Setting', 'CHROME_DRIVER_DIRECTORY')
    self.chrome = webdriver.Chrome(driver_location)

    # Keep the list of categories (directories) to crawl
    self.category_list = settings.items('PlayStoreURL')

    # DBController instance that stores and manages the collected data
    database_location = settings.get('Setting', 'DB_DIRECTORY')
    self.db_connector = DBController(database_location)

    # Create the SQLite table that holds the metadata
    self.db_connector.create_table()
def __init__(self, is_desktop):
    """
    Constructor.

    is_desktop : True when running in a desktop (GUI) environment,
                 False when running on a server.
    """
    # Read the settings from config.ini
    settings = configparser.ConfigParser()
    settings.read('config.ini')

    apk_directory = settings.get('Setting', 'APK_DIRECTORY')
    self.apk_directory = apk_directory
    os.makedirs(apk_directory, exist_ok=True)
    self.is_desktop = is_desktop

    # Server mode: start a virtual display and run Chrome headless
    chrome_options = webdriver.ChromeOptions()
    if not is_desktop:
        self.display = Display(visible=0, size=(1024, 768))
        self.display.start()
        chrome_options.add_argument('--headless')

    # Start the Chrome driver
    driver_location = settings.get('Setting', 'CHROME_DRIVER_DIRECTORY')
    self.chrome = webdriver.Chrome(driver_location,
                                   chrome_options=chrome_options)
    self.chrome.set_window_size(1024, 768)

    # Keep the list of categories (directories) to crawl
    self.category_list = settings.items('PlayStoreURL')

    # DBController instance that stores and manages the collected data
    database_location = settings.get('Setting', 'DB_DIRECTORY')
    self.db_connector = DBController(database_location)

    # Create the SQLite table that holds the metadata
    self.db_connector.create_table()
def computeBaseLine(self, baselineType=0):
    """Print the "same as last week" baseline score.

    For every week from 2013-03-23 to 2013-04-20 the popularity score of
    each song in the previous week is used as the prediction for the
    current week.  Songs absent from last week's top 50 are predicted as
    0; songs without a current rank are scored with their prediction.

    baselineType selects the metric accumulated per week:
        0 - self.getRankEvalationScore
        1 - metrics.r2_score
        anything else - metrics.mean_squared_error

    The per-week scores are averaged over the weeks visited and printed;
    nothing is returned.
    """
    iterWeek, endWeek = datetime(2013, 3, 23), datetime(2013, 4, 20)
    oneWeek = timedelta(weeks=1)
    db = DBController()
    fg = FeatureGenerator()
    baselineScore = 0
    weekCount = 0

    while iterWeek <= endWeek:
        lastWeek = iterWeek - oneWeek
        y_pred, y_test = [], []
        for featureVector in db.getFeatureListByWeek(iterWeek):
            lastWeekRank = db.getTop50Rank(lastWeek, featureVector["id"])
            if lastWeekRank is None:
                lastWeekScore = 0
            else:
                lastWeekScore = fg.rankToPopScore(lastWeekRank)

            currentWeekRank = featureVector["rank"]
            if currentWeekRank is not None:
                currentWeekScore = fg.rankToPopScore(currentWeekRank)
            else:
                currentWeekScore = lastWeekScore

            y_pred.append(lastWeekScore)
            y_test.append(currentWeekScore)

        y_pred = self.getRankArray(numpy.asarray(y_pred))
        y_test = self.getRankArray(numpy.asarray(y_test))

        if baselineType == 0:
            baselineScore += self.getRankEvalationScore(y_pred, y_test)
        elif baselineType == 1:
            # r2_score is not symmetric: ground truth goes first.
            # NOTE(review): assumes `metrics` is sklearn.metrics - confirm.
            baselineScore += metrics.r2_score(y_test, y_pred)
        else:
            baselineScore += metrics.mean_squared_error(y_test, y_pred)

        weekCount += 1
        iterWeek += oneWeek

    # Average over the weeks actually visited rather than a literal 5, so
    # the result stays right if the date range above is edited.
    baselineScore = baselineScore / weekCount
    print(baselineScore)
def extractReviewsToBD(self, songList):
    """Extract the MTV review of every song and store it in the database.

    The position of each song is printed as progress.  A song whose
    extraction or insertion raises is reported on stdout and skipped.
    """
    db = DBController()
    for position, track in enumerate(songList):
        print(position)
        try:
            review = self.extractReviewFromMTV(track)
            db.insertMTVReviewToDB(track["id"], review)
        except Exception as err:
            # Best effort: report the failure and move on to the next song.
            print(err)
def extractDataToDB(self, songList):
    """Extract the IMVDB data of every song and store it in the database.

    A song whose lookup, extraction or insertion raises is reported on
    stdout and skipped.
    """
    db = DBController()
    for track in songList:
        try:
            pageURL = self.getURL(track['title'], track['artist'])
            extracted = self.extractDataFromIMVDB(pageURL)
            viewStats, socialStats, detailStats = extracted
            db.insertIMVDBDataToDB(track['id'], viewStats, socialStats,
                                   detailStats)
        except Exception as err:
            # Best effort: report the failure and move on to the next song.
            print(err)
def loadBrokerageToDB(filePath):
    """Load the brokerage CSV at ``filePath`` into the database.

    The first row is treated as a header and skipped.  For every other
    row, column 0 is the name, column 1 the ``_id`` and column 2 the code;
    cells are stripped of surrounding whitespace before being saved.
    """
    with open(filePath, 'rU') as csvFile:
        db = DBController()
        rows = csv.reader(csvFile)
        next(rows, None)  # skip the header row
        for row in rows:
            cells = [cell.strip() for cell in row]
            db.saveBrokerage({'_id': cells[1],
                              'name': cells[0],
                              'code': cells[2]})
class SentenceClusterer(object):
    """Cluster sentences from the database with K-means on unigram counts.

    Sentences are turned into bag-of-words count vectors over the unigram
    table returned by the database, clustered with ``KMeans`` and the
    resulting cluster id is written back per sentence.
    """

    def __init__(self):
        self.db = DBController()
        self.clusterer = KMeans(n_clusters=TOTAL_CLUSTER)

    def train(self, X):
        """Fit the clusterer on the matrix ``X``."""
        self.clusterer.fit(X)

    def predict(self, X):
        """Return the cluster assigned to every row of ``X``."""
        return self.clusterer.predict(X)

    def updateSentenceCluster(self, clusterList, sentenceIdList):
        """Store the cluster of each sentence; the two lists are paired up."""
        for sentenceId, cluster in zip(sentenceIdList, clusterList):
            self.db.updateSentenceCluster(sentenceId, int(cluster))

    def clusterSentenceInBatch(self, startId=0, limit=5000):
        """Cluster the sentences with ids from ``startId`` on, batch by batch.

        Batches of ``BATCH_SIZE`` ids are processed while they fit entirely
        inside ``[startId, startId + limit]``; a trailing partial batch is
        not processed.
        """
        endId, lastId = startId + BATCH_SIZE, startId + limit
        # `<=` so the batch that ends exactly at lastId is included; with
        # `<` the final full batch was skipped, and a call whose limit
        # equals BATCH_SIZE processed nothing at all.
        while endId <= lastId:
            sentences = self.db.getSentenceInRange(startId, endId)
            self.clusterSentence(sentences)
            startId += BATCH_SIZE
            endId += BATCH_SIZE

    def clusterSentence(self, sentences):
        """Fit on ``sentences``, predict their clusters and store them."""
        sentenceMatrix, sentenceIdList = self.getSentenceMatrixAndIdList(sentences)
        self.train(sentenceMatrix)
        clusterList = self.predict(sentenceMatrix)
        self.updateSentenceCluster(clusterList, sentenceIdList)

    def getSentenceMatrixAndIdList(self, sentences):
        """Return ``(count matrix, id list)`` for ``sentences``.

        Row ``n`` of the matrix is the count vector of the n-th sentence
        and ``idList[n]`` is its ``_id``.  The running index is printed as
        progress.
        """
        table = self.db.getUnigramTable()
        matrix, idList = [], []
        for i, sentence in enumerate(sentences):
            print(i)
            wordList = getProcessedWordList(sentence['content'])
            matrix.append(self.getSentenceVector(table, wordList))
            idList.append(sentence['_id'])
        return numpy.array(matrix), idList

    def getSentenceVector(self, table, wordList):
        """Return the count vector of ``wordList`` over ``table``.

        ``table`` maps a word to its column index; the result has
        ``len(table)`` entries.  A word missing from ``table`` raises
        ``KeyError``.
        """
        vector = [0] * len(table)
        # Counting is order-independent, so no sorting of the indices is
        # needed.
        for word in wordList:
            vector[table[word]] += 1
        return vector

# if __name__ == '__main__':
#     sc = SentenceClusterer()
#     sc.clusterSentenceInBatch(30000, 1000)
def extractYoutubeCommentsToDB(self, songList):
    """Fetch the YouTube comments of every song and store them in the database.

    The video is searched by "<title> <artist>".  A song whose lookup,
    download or insertion raises is reported on stdout and skipped.
    """
    db = DBController()
    for track in songList:
        try:
            query = track['title'] + ' ' + track['artist']
            videoID = self.getVideoID(query)
            videoComments = self.getComments(videoID)
            db.insertCommentToDB(track['id'], videoComments)
        except Exception as err:
            # Best effort: report the failure and move on to the next song.
            print(err)
def outputTopX(self, songIdList, rankList, x=10):
    """Return the songs ranked 1..x, in rank order.

    ``rankList[n]`` is the rank of the song whose id is ``songIdList[n]``.
    Ranks that do not occur in ``rankList``, or whose song cannot be
    fetched, are skipped, so the result may hold fewer than ``x`` songs.

    Raises:
        Exception: if ``x`` is greater than 40.
    """
    if x > 40:
        raise Exception("x must be less than 40")

    db = DBController()
    songList = []
    for rank in range(1, x + 1):
        try:
            songId = songIdList[rankList.index(rank)]
            songList.append(db.getSongById(songId))
        except Exception:
            # Best effort: skip ranks that are missing or fail to load.
            # `except Exception` (not a bare except) keeps Ctrl-C and
            # SystemExit working.
            continue
    return songList
async def download_article(self, url: str, sem) -> PhyWebArticle:
    """Download, parse and store the article at ``url``.

    The work is done while holding ``sem``.  An article is stored through
    ``DBController.add_document`` under a fresh UUID only when the page
    could be loaded and parsing produced at least one normalized word.

    Returns:
        The stored article, or ``None`` when the page was empty or could
        not be parsed (a parse failure is also reported on stdout).
    """
    async with sem:
        article_html = await self.load_html(url)
        if len(article_html) == 0:
            return None

        article = self.parse_html(url, article_html)
        if len(article.normalized_words) == 0:
            print(f'url {url} PARSE ERR')
            return None

        DBController.add_document(article, str(uuid.uuid4()))
        # The signature promises an article; previously the function fell
        # off the end and always returned None.
        return article
def __getitem__(self, key):
    """Return the article(s) selected by ``key``.

    A slice yields a list of ``BaseArticle`` objects; an integer (Python
    or NumPy) yields a single one, with negative values counting from the
    end.  Each article is fetched through ``DBController.get_article``.

    Raises:
        IndexError: if the integer index is out of range.
        TypeError: if ``key`` is neither a slice nor an integer.
    """
    if isinstance(key, slice):
        begin, end, stride = key.indices(len(self))
        return [
            BaseArticle(DBController.get_article(self.articles_id[pos]))
            for pos in range(begin, end, stride)
        ]

    if not np.issubdtype(type(key), np.integer):
        raise TypeError("Invalid argument type.")

    if key < 0:
        key += len(self)
    if not 0 <= key < len(self):
        raise IndexError("The index {} is out of range.".format(key))
    return BaseArticle(DBController.get_article(self.articles_id[key]))
def __init__(self, is_desktop):
    """Set the crawler up from ``config.ini``.

    Reads the APK directory, the Chrome driver location, the Play Store
    category list and the database location from the configuration file,
    starts Chrome and creates the database table.  When ``is_desktop`` is
    false a virtual display is started before the browser.
    """
    settings = configparser.ConfigParser()
    settings.read('config.ini')

    def setting(option):
        # Shorthand for an option of the [Setting] section.
        return settings.get('Setting', option)

    self.apk_directory = setting('APK_DIRECTORY')
    self.is_desktop = is_desktop

    if not is_desktop:
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()

    self.chrome = webdriver.Chrome(setting('CHROME_DRIVER_DIRECTORY'))
    self.category_list = settings.items('PlayStoreURL')

    self.db_connector = DBController(setting('DB_DIRECTORY'))
    self.db_connector.create_table()
def main():
    """Command-line entry point of the player importer.

    Parses ``--schema`` (default "ootp_players"), ``--config`` and
    ``--playerFile``, connects to the database, lets ``DBController``
    check its initialisation and imports the given player file.
    """
    print("Welcome to stats importer!")

    cli = argparse.ArgumentParser()
    cli.add_argument("--schema", help="Schema name", default="ootp_players")
    cli.add_argument("--config", help="Json config file")
    cli.add_argument("--playerFile", help="Player file to import")
    options = cli.parse_args()

    database = BaseModel.TheDatabase
    database.init(options.schema, user='******', password='******')
    database.connect()

    DBController().checkInit()

    league = LeagueConsts(options.config)
    PlayersImporter(league).doImport(options.playerFile)
def extractSalesRankToDB(self, beginDate=None, endDate=None):
    """Download the weekly sales charts between two dates into the database.

    Both dates are snapped with ``dateToSaturday`` and the range is walked
    in 7-day steps; weeks whose chart is already in the database are
    skipped.

    Args:
        beginDate: first date of the range; defaults to the current time.
        endDate: last date of the range; defaults to the current time.

    Raises:
        Exception: if ``beginDate`` is before 2007-01-01 or ``endDate``
            lies in the future.
    """
    # The defaults are resolved per call.  A `datetime.today()` default in
    # the signature is evaluated only once, when the function is defined,
    # and would go stale in a long-running process.
    if beginDate is None:
        beginDate = datetime.today()
    if endDate is None:
        endDate = datetime.today()

    if beginDate < datetime(2007, 1, 1) or endDate > datetime.today():
        raise Exception('Invalid input date!')

    oneWeek = timedelta(days=7)
    beginDate = dateToSaturday(beginDate)
    endDate = dateToSaturday(endDate)
    # Snapping may have moved the end date into the future; step back a week.
    if endDate > datetime.today():
        endDate = endDate - oneWeek

    db = DBController()
    iterDate = beginDate
    while iterDate <= endDate:
        if not db.checkSalesRankExistInDB(iterDate):
            chart = self.getSalesChartFromURL(self.getURL(iterDate))
            db.insertSalesChartToDB(iterDate, chart)
        iterDate = iterDate + oneWeek
def __init__(self, taskQueue, resultQueue, *args):
    """Store the task/result queues and extra arguments; open a DB controller.

    ``_executeFunction`` starts out as ``None``.
    """
    super(ProcessThread, self).__init__()
    self._taskQueue, self._resultQueue = taskQueue, resultQueue
    self._args = args
    self._executeFunction = None
    self._db = DBController()
def loadCompeletedCodingFile(filePath):
    """Load the completed-coding CSV at ``filePath`` into the database.

    The first row is a header and is skipped.  In every other row the
    columns 7, 9 and 10 (FAVORABILITY, LOCUS_CAUSALITY, CONTROLLABILITY)
    are converted to ``int`` and the row is saved as a dict keyed by the
    column names below.  Rows that cannot be converted or saved are
    skipped silently.
    """
    keyList = ['_id', 'OC_ID', 'OUTCOME_ID', 'CAUSE_ID', 'PR_ID', 'NAME',
               'OUTCOME', 'FAVORABILITY', 'CAUSE', 'LOCUS_CAUSALITY',
               'CONTROLLABILITY']
    intColumns = (7, 9, 10)

    db = DBController()
    with open(filePath, 'rU') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for line in reader:
            try:
                for column in intColumns:
                    line[column] = int(line[column])
                db.saveCompletedSentence(dict(zip(keyList, line)))
            except Exception:
                # Best effort: a malformed or rejected row is skipped.
                # `except Exception` (not a bare except) keeps Ctrl-C
                # working during a long load.
                continue
def check_db_status():
    """Seed the database when it contains no articles."""
    # Only emptiness matters, so stop at the first article instead of
    # counting the whole collection.
    has_articles = any(True for _ in DBController.get_all_articles())
    if not has_articles:
        print('Seeding database...')
        DatabaseSeeder.seed()
def loadAllRTFToDB(folderPath):
    """Parse every ``.rtf`` file under ``folderPath`` and save its articles.

    Each file is converted to plain text and split into articles at the
    lines of the form ``Document <id>``.  Inside an article the last line
    starting with "by " is the byline, the last line ending in " words"
    marks the word count, and the text starts two lines after the last
    line equal to "english" (all compared case-insensitively).  Articles
    without a word-count or text marker, or with the markers out of
    order, are reported on stdout and not saved.  Files that cannot be
    read or converted are skipped.
    """
    db = DBController()
    for dirPath, dirNames, fileNames in os.walk(folderPath):
        for fileName in fileNames:
            if not fileName.endswith('.rtf'):
                continue
            filePath = os.path.join(dirPath, fileName)
            print(filePath)

            try:
                # `with` closes the handle; it used to be left open for
                # every file visited.
                with open(filePath) as rtfFile:
                    doc = Rtf15Reader.read(rtfFile)
                    text = PlaintextWriter.write(doc).getvalue()
            except Exception:
                # Unreadable file: skip it, but let Ctrl-C through.
                continue

            lines = [line.strip() for line in text.split('\n') if line]

            # Split the file into articles; each one ends at its
            # "Document <id>" line.
            articleLinesDict, articleStartIndex = {}, 0
            for i, line in enumerate(lines):
                if line.startswith('Document ') and len(line.split(' ')) == 2:
                    articleId = line.split(' ')[-1]
                    articleLinesDict[articleId] = lines[articleStartIndex:i]
                    articleStartIndex = i + 1

            for articleId, articleLines in articleLinesDict.items():
                bylineIndex, wordCountIndex, textStartIndex = -1, -1, -1
                for i, line in enumerate(articleLines):
                    line = line.lower()
                    if line.startswith('by '):
                        bylineIndex = i
                    elif line.endswith(' words'):
                        wordCountIndex = i
                    elif line == 'english':
                        textStartIndex = i + 2

                if (wordCountIndex == -1 or textStartIndex == -1
                        or wordCountIndex > textStartIndex):
                    print(filePath + ', ' + articleId)
                    continue

                # The headline is everything before the byline, or before
                # the word-count line when there is no byline.
                if bylineIndex == -1:
                    headline = ' '.join(articleLines[:wordCountIndex])
                    byline = ''
                else:
                    headline = ' '.join(articleLines[:bylineIndex])
                    byline = articleLines[bylineIndex]

                date = parser.parse(articleLines[wordCountIndex + 1])

                # The source name follows the date unless that line holds
                # a time of day, in which case it is one line further down.
                sourceName = articleLines[wordCountIndex + 2]
                if ' AM' in sourceName or ' PM' in sourceName:
                    sourceName = articleLines[wordCountIndex + 3]

                articleDict = {
                    '_id': articleId,
                    'filePath': filePath.split('Marshall_RA/')[-1],
                    'headline': headline,
                    'byline': byline,
                    'date': date,
                    'sourceName': sourceName,
                    'leadParagraph': '',
                    'tailParagraph': '\n'.join(articleLines[textStartIndex:]),
                    'sourceCode': '',
                    'industry': [],
                    'region': [],
                    'newsSubject': [],
                    'company': [],
                }
                db.saveArticle(articleDict)
def loadAllXMLtoDB(inputDir): #have folder and p, pa info, insert after get db = DBController() for dirName, _, fileNames in os.walk(inputDir): print(dirName) for fileName in fileNames: try: if not fileName.endswith('xml'): continue fileAbsPath = getAbsPath(dirName, fileName) for articleDict in parseArticleFromXML(fileAbsPath): #duplication check # if db.isArticleDuplicate(articleDict['tailParagraph']): # continue articleDict['filePath'] = fileAbsPath.split('Marshall_RA/')[1] db.saveArticle(articleDict) except Exception as e: print e, dirName, fileName
def getFeatureMatrix(self, beginWeek, endWeek=None, mode=0, withSongId=False):
    """Return the feature matrix of all weeks from ``beginWeek`` to ``endWeek``.

    Both dates are snapped with ``dateToSaturday`` and the range is walked
    week by week.  Every feature dict of a week is converted with
    ``self.featureDictToList(featureDict, mode, withSongId)``; dicts for
    which that returns ``None`` are left out.

    Args:
        beginWeek: first date of the range.
        endWeek: last date of the range; defaults to the current time.
        mode, withSongId: passed through to ``featureDictToList``.

    Returns:
        A ``numpy.matrix`` with one row per feature vector.

    Raises:
        Exception: if ``beginWeek`` is before 2007-01-01 or ``endWeek``
            lies in the future.
    """
    # Resolved per call: a `datetime.today()` default in the signature is
    # evaluated only once, when the function is defined, and goes stale.
    if endWeek is None:
        endWeek = datetime.today()

    if beginWeek < datetime(2007, 1, 1) or endWeek > datetime.today():
        raise Exception('Invalid input date!')

    beginWeek, endWeek = dateToSaturday(beginWeek), dateToSaturday(endWeek)
    # Snapping may have moved the end date into the future; step back a week.
    if endWeek > datetime.today():
        endWeek = endWeek - timedelta(days=7)

    db = DBController()
    rows = []
    iterWeek = beginWeek
    while iterWeek <= endWeek:
        for featureDict in db.getFeatureListByWeek(iterWeek):
            featureVector = self.featureDictToList(featureDict, mode, withSongId)
            if featureVector is not None:
                rows.append(featureVector)
        iterWeek += timedelta(weeks=1)

    return numpy.matrix(rows)
def save(self):
    """Insert this user into the ``users`` collection.

    The new user is stored as a non-admin with a balance of 0.

    Returns:
        True when the insert returned a result, False when it returned
        ``None`` or anything raised along the way.
    """
    try:
        db_controller = DBController()
        address = BaseUser.get_address(self)
        # NOTE(review): the password is stored exactly as get_password()
        # returns it - confirm it is hashed before it reaches this point.
        result = db_controller.insert_data(
            'users', {
                'username': BaseUser.get_name(self),
                'password': BaseUser.get_password(self),
                'address': address.get_street(),
                'cpf': BaseUser.get_cpf(self),
                'admin': False,
                'balance': 0
            })
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C and SystemExit
        # are not reported as a failed save.
        return False
    return result is not None
def load_model(cls, path: str, model_name: str,
               model_type: str) -> BaseModel:
    """Load a trained model and its companion files from ``path``.

    Every model type except 'd2v' also loads ``<model_name>.dict`` and
    ``<model_name>.mm``; 'ft' additionally loads the similarity matrix
    ``<model_name>.mat.npz``.  The training sample is rebuilt from the
    articles whose ``serial_id`` is listed by ``cls.load_articles_id``.

    Supported ``model_type`` values are 'lsi', 'lda', 'd2v' and 'ft'; for
    any other value ``None`` is returned.
    """
    print(f'{model_name}.{model_type} model loading...')

    dictionary = corpus = similarity_matrix = None
    if model_type != 'd2v':
        dictionary = corpora.Dictionary.load(
            os.path.join(path, f'{model_name}.dict'))
        corpus = corpora.MmCorpus(os.path.join(path, f'{model_name}.mm'))
    if model_type == 'ft':
        similarity_matrix = sparse.load_npz(
            os.path.join(path, f'{model_name}.mat.npz'))

    articles_id = cls.load_articles_id(path)
    articles = DBController.get_all_articles(
        {'serial_id': {'$in': articles_id}})
    training_sample = TrainingSample(articles)

    model_path = os.path.join(path, f'{model_name}.{model_type}')
    # Keyword arguments common to every wrapper's `trained` constructor.
    shared = dict(name=model_name,
                  corpus=corpus,
                  dictionary=dictionary,
                  training_sample=training_sample)

    if model_type == 'lsi':
        raw = models.lsimodel.LsiModel.load(model_path)
        model = LsiModel.trained(model=raw, **shared)
    elif model_type == 'lda':
        raw = models.ldamodel.LdaModel.load(model_path)
        model = LdaModel.trained(model=raw, **shared)
    elif model_type == 'd2v':
        raw = models.doc2vec.Doc2Vec.load(model_path)
        model = D2vModel.trained(model=raw, **shared)
    elif model_type == 'ft':
        raw = models.FastText.load(model_path)
        model = FastTextModel.trained(model=raw,
                                      similarity_matrix=similarity_matrix,
                                      **shared)
    else:
        model = None

    print('Loaded')
    return model
def loadPRFiles(folderPath):
    """Load every ``*TXT.txt`` press-release file under ``folderPath``.

    The file name (up to its first dot) is the article ``_id``; split on
    '_' it gives the code, the year (converted to ``int``) and the
    quarter.  The file's lines are joined with '\\n ' and decoded as
    UTF-8, ignoring invalid bytes.  An article the database refuses to
    save is skipped silently.
    """
    db = DBController()
    for dirPath, dirNames, fileNames in os.walk(folderPath):
        for fileName in fileNames:
            if not fileName.endswith('TXT.txt'):
                continue

            filePath = os.path.join(dirPath, fileName)
            articleId = fileName.split('.')[0]
            fileNameParts = articleId.split('_')
            articleDict = {'_id': articleId,
                           'code': fileNameParts[0],
                           'year': int(fileNameParts[1]),
                           'quarter': fileNameParts[2]}

            with open(filePath, 'rU') as f:
                articleDict['text'] = ('\n '.join(f.readlines())).decode('utf-8', 'ignore')

            try:
                db.savePRArticle(articleDict)
            except Exception:
                # Best effort: skip articles that cannot be saved.
                # `except Exception` (not a bare except) keeps Ctrl-C
                # working during a long load.
                pass

#if __name__ == '__main__':
    #loadCompeletedCodingFile('Corpus/completed-coding.csv')
    #loadPRFiles('/Users/exsonic/Dropbox/Marshall_RA/ENRON/SP500_PR_1999_2004')
def __init__(self, taskQueue, resultQueue, *args):
    """Store the queues and extra arguments and prepare the export folder.

    Opens a DB controller, loads the citation word list and creates the
    ``export/`` directory when it does not exist yet.
    """
    super(DataProcessorThread, self).__init__()
    self._taskQueue, self._resultQueue = taskQueue, resultQueue
    self._args = args
    self._executeFunction = None
    self._db = DBController()
    self._citeWordList = getWordList(WORD_CITE)

    exportDir = 'export/'
    if not os.path.exists(exportDir):
        os.makedirs(exportDir)
class PyPass:
    """Password-manager facade over ``DBController`` and ``SecurityManager``.

    Usernames, master passwords and entry identifiers are stored hashed;
    the username and password of an entry are stored ciphered with a
    ``SecurityManager`` that is created from the master password in
    ``auth_user``.  Every method that ciphers or deciphers therefore
    requires a successful ``auth_user`` call first.
    """

    def __init__(self):
        self.__dbController = DBController('localhost', 27017)
        # Both are filled in later (set_username / auth_user); declared
        # here so the attributes always exist on the instance.
        self.__username = None
        self.__securityManager = None

    def set_username(self, username):
        """Remember the hashed ``username`` here and on the DB controller."""
        hashedUsername = SecurityManager.get_hash(username)
        self.__username = hashedUsername
        self.__dbController.set_username(hashedUsername)

    def store_entry(self, id, ciphered_user, ciphered_pass):
        """Store an entry whose fields are already hashed/ciphered."""
        new_entry = Entry(id, ciphered_user, ciphered_pass)
        self.__dbController.storeEntry(new_entry)

    def get_entry(self, identifier):
        """Return the deciphered entry stored under ``identifier``.

        The entry is looked up by the hash of ``identifier``.

        Returns:
            dict with the keys 'identifier', 'username' and 'password'.
        """
        entry = self.__dbController.getEntry(SecurityManager.get_hash(identifier))
        username = self.__securityManager.decipher_field(entry.username)
        password = self.__securityManager.decipher_field(entry.password)
        return {'identifier': identifier, 'username': username, 'password': password}

    def create_entry(self, id, username, password):
        """Cipher ``username``/``password`` and store them under the hashed ``id``."""
        ciphered_user = self.__securityManager.cipher_field(username)
        ciphered_password = self.__securityManager.cipher_field(password)
        hashed_id = SecurityManager.get_hash(id)
        self.store_entry(hashed_id, ciphered_user, ciphered_password)

    def generate_new_entry(self, id, username):
        """Store a new entry for ``id`` with a generated password."""
        ciphered_user = self.__securityManager.cipher_field(username)
        # NOTE(review): presumably generate_password() already returns the
        # password in ciphered form - confirm against SecurityManager.
        ciphered_pass = self.__securityManager.generate_password()
        # The id is hashed, exactly as in create_entry.  get_entry looks
        # entries up by hash, so an entry stored under a ciphered id could
        # not be found again.
        hashed_id = SecurityManager.get_hash(id)
        self.store_entry(hashed_id, ciphered_user, ciphered_pass)

    def create_account(self, username, masterPass):
        """Create an account; returns whatever ``DBController.createUser`` returns.

        The username is hashed once and the master password twice.
        """
        hashedUsername = SecurityManager.get_hash(username)
        hashedMasterPass = SecurityManager.get_hash(SecurityManager.get_hash(masterPass))
        return self.__dbController.createUser(hashedUsername, hashedMasterPass)

    def auth_user(self, username, masterPass):
        """Authenticate and, on success, prepare this instance for use.

        On success the DB controller is bound to the user and a
        ``SecurityManager`` is created from the plain master password.

        Returns:
            True when the credentials were accepted, otherwise False.
        """
        hashedUsername = SecurityManager.get_hash(username)
        hashedMasterPass = SecurityManager.get_hash(SecurityManager.get_hash(masterPass))
        if not self.__dbController.authUser(hashedUsername, hashedMasterPass):
            return False
        self.__dbController.set_username(hashedUsername)
        self.__securityManager = SecurityManager(masterPass)
        return True
def initialize_pool(self):
    """Create the connection queue and, if configured, fill it right away.

    When ``conn_at_start`` is set, ``max_pool_size`` controllers are
    created from ``self.conf`` and queued, and ``current_conn_size`` is
    increased once per controller.
    """
    # TODO: another queue is needed to keep track of the connections in use
    self.pool = Queue.Queue(maxsize=self.max_pool_size)
    if not self.conn_at_start:
        return

    for _ in range(self.max_pool_size):
        controller = DBController(host=self.conf["host"],
                                  db_name=self.conf["db_name"],
                                  db_user_name=self.conf["db_user_name"],
                                  psd=self.conf["db_psw"],
                                  port=self.conf["port"])
        self.pool.put_nowait(controller)
        self.current_conn_size += 1
def __init__(self):
    """Load the word lists, engagers, companies and their regex patterns."""
    self.db = DBController()

    # Attribute name -> word-list source, loaded in this order.
    wordListSources = (
        ('pfmWord', WORD_PFM),
        ('posWord', WORD_POS),
        ('negWord', WORD_NEG),
        ('exWord', ATRB_EX),
        ('inWord', ATRB_IN),
        ('citeWord', WORD_CITE),
    )
    for attributeName, source in wordListSources:
        setattr(self, attributeName, getWordList(source))

    self.engagerList = list(self.db.getAllEngager())
    self.companyList = list(self.db.getAllCompany())

    patternDicts = self.getRegexPatternDictForEngagerAndCompany()
    self.engagerRegexPatternDict, self.companyRegexPatternDict = patternDicts
class Menu:
    """Bot menu backed by the reminders/tasks database."""

    def __init__(self, bot):
        self.state = 0
        self.bot = bot
        self.db = DBController()
        self.mainTemplate = ""

    def initialMessage(self):
        """Send the initial message for today to the main user.

        Today's reminders are loaded ordered by their date/time, and the
        half-hour slots from the starting hour (5, or the hour of the
        first reminder when that is later) are formatted.

        NOTE(review): this method looks unfinished - the formatted slot
        lines are never added to ``text`` (which stays empty), the tasks
        are loaded but not used, and ``bot.send_message`` is called
        without ``await`` although ``send_message`` below awaits it.
        """
        today = datetime.today()
        # Fixed: the method name was misspelled `.foramt`, which raised
        # AttributeError on every call.
        reminders = self.db.getReminders(
            where='''DAY(reminder_datetime)={} AND MONTH(reminder_datetime)={} AND YEAR(reminder_datetime)={} ORDER BY reminder_datetime'''.format(
                str(today.day), str(today.month), str(today.year)))
        tasks = self.db.getTasks()

        text = ""
        startHour = 5
        startMinute = '0'
        # Guarded: a day without reminders used to raise IndexError here.
        if reminders and reminders[0][2].hours > 5:
            startHour = reminders[0][2].hours

        for i in range((24 - startHour) * 2):
            # Fixed: the template has three fields but only two values
            # were supplied (IndexError).  TODO(review): the third field
            # is presumably the reminder/task description; it is left
            # blank until that is confirmed.
            s = '{}:{}0 {} \n'.format(str(startHour), startMinute, '')

        self.bot.send_message(config.MAIN_USER_ID,
                              text,
                              disable_notification=False)

    async def send_message(self,
                           user_id: int = config.MAIN_USER_ID,
                           text: str = 'Hi',
                           disable_notification: bool = False) -> bool:
        """Send ``text`` to ``user_id`` through the bot.

        Returns:
            True when sending raised an exception, False when it went
            through.  NOTE(review): this reads inverted for a send
            function; it is kept as-is because callers may rely on it -
            confirm the intended meaning.
        """
        try:
            await self.bot.send_message(
                user_id, text, disable_notification=disable_notification)
        except Exception:
            # `except Exception` (not a bare except) so that task
            # cancellation is not swallowed.
            return True
        return False
def __init__(self, app, title, user, balance):
    """Build the home window: greeting, menu buttons, balance and sub-views.

    ``user`` and ``balance`` are strings shown in the labels.  The names
    of all users are read from the database and passed to the transfer
    view together with ``user``.
    """
    SubWindow.__init__(self, app, title)
    SubWindow.set_size(self, "500x400")
    SubWindow.add_label(self, "Bem vindo," + user + " .")

    menu_labels = ["Historico", "Trocas", "Transferencias", "Deposito",
                   "Logout"]
    SubWindow.add_btns(self, menu_labels, self.__btn_callback)

    SubWindow.add_label(self, "Seu saldo é " + balance)
    SubWindow.add_label(self, "Sua ultima transação foi ...")

    # Usernames of every account, offered by the transfer view.
    db_controller = DBController()
    all_usernames = [row['username']
                     for row in db_controller.select_data("users")]

    self.historicoView = HistoricoView(SubWindow.get_app_gui(self),
                                       "Historico")
    self.trocasView = TrocasView(SubWindow.get_app_gui(self), "Trocas")
    self.transferenciasView = TransferenciasView(
        SubWindow.get_app_gui(self), "Transferencias", all_usernames, user)
    self.depositoView = DepositoView(SubWindow.get_app_gui(self),
                                     "Deposito")
def execute(self):
    """Move ``get_valor()`` from user1's balance to user2's balance.

    Returns:
        True when both balances were updated, False when anything raised
        along the way.

    NOTE(review): the two updates are not atomic, there is no check for
    sufficient funds, and when user1 and user2 are the same account the
    second update overwrites the first (the balance grows) - confirm
    whether callers rule these cases out.
    """
    try:
        db_controller = DBController()
        user1_balance = db_controller.select_data_single(
            "users", {"username": self.__user1})['balance']
        user2_balance = db_controller.select_data_single(
            "users", {"username": self.__user2})['balance']

        # Read the amount once so the debit and the credit always match.
        amount = self.get_valor()
        db_controller.update_data("users", {'username': self.__user1},
                                  {'balance': user1_balance - amount})
        db_controller.update_data("users", {'username': self.__user2},
                                  {'balance': user2_balance + amount})
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C and SystemExit
        # are not reported as a failed transfer.
        return False
    return True
def __seed_pdf_articles(cls):
    """Seed the database with the PDF articles from ``Resources/pdf_articles.json``.

    When the file is missing a message is printed and nothing is stored.
    Otherwise every entry gets ``lang`` set to 'en' and its ``text``
    normalized into ``normalized_words`` before it is stored as a
    ``PhyPdfArticle`` under a fresh UUID.  ``cls`` is not used.
    """
    data_path = 'Resources/pdf_articles.json'
    if not os.path.isfile(data_path):
        print('Resource does not exist!')
        return

    with open(data_path, 'r', encoding='utf8') as data_file:
        data = json.load(data_file)

    total = len(data)
    for index, article_data in enumerate(data, start=1):
        title = article_data['title']
        article = PhyPdfArticle({
            **article_data,
            'lang': 'en',
            'normalized_words': TextNormalizer.normalize(article_data['text']),
        })
        print(f'add {index} of the {total} articles: {title}')
        # A constructor call never yields None, so the article is stored
        # unconditionally.
        DBController.add_document(article, str(uuid.uuid4()))
def get_dbc(self):
    """Return a DB controller with a fresh cursor.

    When the pool is filled lazily (``conn_at_start`` is false) and still
    below ``max_pool_size``, a new controller is created; if the new
    controller is falsy a message is printed and ``None`` is returned.
    Otherwise a controller is taken from the pool, blocking until one is
    available.
    """
    may_create = (not self.conn_at_start
                  and self.current_conn_size < self.max_pool_size)
    if may_create:
        controller = DBController(host=self.conf["host"],
                                  db_name=self.conf["db_name"],
                                  db_user_name=self.conf["db_user_name"],
                                  psd=self.conf["db_psw"],
                                  port=self.conf["port"])
        if not controller:
            print("cannot generate dbccontroller")
            return None
        self.current_conn_size += 1
    else:
        # Blocks until a controller is put back into the pool.
        controller = self.pool.get(True)

    controller.new_cur()
    return controller
class DataProcessorThread(Thread):
    """Worker thread that runs one export/processing routine.

    ``run`` simply calls whatever callable was assigned to
    ``_executeFunction``; the caller picks one of the ``process*`` /
    ``export*`` methods below before starting the thread.
    """

    def __init__(self, taskQueue, resultQueue, *args):
        """Store the queues and extra arguments and open a DB controller.

        Args:
            taskQueue: Queue the ``process*`` methods read work items from.
            resultQueue: Queue the ``process*`` methods put output rows on.
            *args: Extra arguments; ``processKeywordSearch`` reads
                ``args[0]`` as the search string.
        """
        super(DataProcessorThread, self).__init__()
        self._taskQueue = taskQueue
        self._resultQueue = resultQueue
        self._args = args
        self._executeFunction = None
        self._db = DBController()
        self._citeWordList = getWordList(WORD_CITE)
        # Make sure the output folder used by the export* methods exists.
        if not os.path.exists('export/'):
            os.makedirs('export/')

    def exportSentenceAnalysis(self):
        """Write one CSV row per stored sentence to ``export/sentence.csv``.

        A sentence whose processing raises is printed and skipped.
        """
        #sentence collection is all the sentence
        #deprecated, need to refactor and apply queue
        with open('export/sentence.csv', 'wb') as f:
            writer = csv.writer(f)
            sentences = self._db.getAllSentence()
            # Cache of articles already fetched, keyed by article id.
            articleDict = {}
            # NOTE(review): 'coname' appears twice in this header.
            attributeList = ['id', 'cotic', 'coname', 'filePath', 'accessionNo', 'content', 'coname','ceoname', 'cite', 'co_c', 'ceo_c', 'analyst_c', 'pfm', 'pfm_words', 'pos', 'pos_words', 'neg', 'neg_words', 'internal', 'int_words', 'external', 'ext_words', 'quote_sen', 'analyst']
            writer.writerow(attributeList)
            for i, sentence in enumerate(sentences):
                try:
                    print(i)
                    if sentence['articleId'] not in articleDict:
                        articleDict[sentence['articleId']] = self._db.getArticleById(sentence['articleId'])
                    article = articleDict[sentence['articleId']]
                    # The company code is taken from the file path: third
                    # component from the end when the parent folder is 'a',
                    # otherwise the parent folder itself.
                    articlePathPartList = article['filePath'].split('/')
                    articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                    articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                    # Fall back to the code when the company is unknown.
                    articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                    sentenceCompanyList = [self._db.getCompanyById(companyId) for companyId in sentence['company']]
                    sentenceCompanyNameString = ','.join([company['shortName'] for company in sentenceCompanyList])
                    sentenceEngagerList = [self._db.getEngagerById(engagerId) for engagerId in sentence['engager']]
                    CEOList = filter(lambda engager : engager['type'] == ENGAGER_CEO, sentenceEngagerList)
                    # NOTE(review): analystList is built but not used below.
                    analystList = filter(lambda engager : engager['type'] == ENGAGER_ANALYST, sentenceEngagerList)
                    CEONameString = ','.join([CEO['lastName'] for CEO in CEOList])
                    citeWordString = ','.join(sentence['cite'])
                    citeCompany, citeCEO, citeAnalyst = int(sentence['citeCompany']), int(sentence['citeCEO']), int(sentence['citeAnalyst'])
                    pfmWordString = ','.join(sentence['pfm'])
                    posWordString = ','.join(sentence['pos'])
                    negWordString = ','.join(sentence['neg'])
                    inWordString = ','.join(sentence['in'])
                    exWordString = ','.join(sentence['ex'])
                    quoteString = getQuotedString(sentence['content'])
                    analystSurroundString = getStringSurroundWordInDistance(sentence['content'], 'analyst', ANALYST_SURROUND_DISTANCE)
                    # Field order must match attributeList above.
                    lineList = [sentence['_id'], articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], sentence['content'].encode('utf-8'), sentenceCompanyNameString, CEONameString, citeWordString, citeCompany, citeCEO, citeAnalyst, len(sentence['pfm']), pfmWordString, len(sentence['pos']), posWordString, len(sentence['neg']), negWordString, len(sentence['in']), inWordString, len(sentence['ex']), exWordString, quoteString, analystSurroundString]
                    writer.writerow(lineList)
                except Exception as e:
                    print(e)

    def exportArticleAnalysis(self):
        """Write one CSV row per article to ``export/article.csv``.

        Articles lacking a ``company`` or ``newsSubject`` field get a
        default assigned, and the article is saved back through
        ``self._db.saveArticle``. An article whose processing raises is
        printed and skipped.
        """
        #deprecated
        with open('export/article.csv', 'wb') as f:
            writer = csv.writer(f)
            articleList = list(self._db.getAllArticle())
            attributeList = ['cotic', 'coname', 'filePath', 'accessNo', 'date', 'source', 'byline', 'coname1', 'coname2', 'coname3', 'coname4', 'coname5', 'subjectCode1', 'subjectCode2', 'subjectCode3', 'subjectCode4', 'subjectCode5']
            writer.writerow(attributeList)
            for i, article in enumerate(articleList):
                try:
                    print(i)
                    articlePathPartList = article['filePath'].split('/')
                    articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                    articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                    articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                    # Fixed-width slots so the CSV columns line up.
                    companyCodeList = [''] * ARTICLE_EXPORT_CODE_SIZE
                    subjectCodeList = [''] * ARTICLE_EXPORT_CODE_SIZE
                    if 'company' in article:
                        # The inner loop reuses the name `i`; the outer
                        # index is only printed before this point.
                        for i, companyCode in enumerate(article['company']):
                            if i >= ARTICLE_EXPORT_CODE_SIZE:
                                break
                            companyCodeList[i] = companyCode
                    else:
                        article['company'] = [articleCompanyCode]
                        # NOTE(review): this replaces the padded list with a
                        # one-element list, so the row is shorter than the
                        # header in this branch -- confirm whether intended.
                        companyCodeList = article['company']
                    if 'newsSubject' in article:
                        for i, subjectCode in enumerate(article['newsSubject']):
                            if i >= ARTICLE_EXPORT_CODE_SIZE:
                                break
                            subjectCodeList[i] = subjectCode
                    else:
                        article['newsSubject'] = []
                        # NOTE(review): same as above -- an empty list
                        # instead of the padded slots.
                        subjectCodeList = article['newsSubject']
                    self._db.saveArticle(article)
                    lineList = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], article['date'], article['sourceName'], article['byline']] + companyCodeList + subjectCodeList
                    writer.writerow(lineList)
                except Exception as e:
                    print(e)

    def processKeywordSearch(self):
        """Consume articles from the task queue and emit keyword-match rows.

        For each article the headline, byline, lead and tail paragraphs are
        searched sentence by sentence; when no sentence matches, the same
        four texts are searched as whole paragraphs. One row per article is
        put on the result queue. Stops when ``END_OF_QUEUE`` is received.
        """
        searchString = self._args[0]
        while True:
            article = self._taskQueue.get()
            if article == END_OF_QUEUE:
                break
            else:
                articlePathPartList = article['filePath'].split('/')
                articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                articleSentenceList = []
                #here, use '|' to combine regex is OK, because sentence is short, will not reduce the performance that much.
                #But in DB search, use iterative way.
                pattern = getPatternByKeywordSearchString(searchString)
                #on sentence level first, if can't find, go to paragraph level.
                for paragraph in [article['headline'], article['byline'], article['leadParagraph'], article['tailParagraph']]:
                    sentenceList = sent_tokenize(paragraph)
                    for sentence in sentenceList:
                        if re.search(pattern, sentence) is not None:
                            articleSentenceList.append(sentence.encode('utf-8').strip())
                if not articleSentenceList:
                    #search on paragraph level
                    for paragraph in [article['headline'], article['byline'], article['leadParagraph'], article['tailParagraph']]:
                        if re.search(pattern, paragraph) is not None:
                            articleSentenceList.append(paragraph.encode('utf-8').strip())
                # Matches are tab-joined into the last column.
                lineList = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], article['date'], article['sourceName'].strip(), article['byline'].strip(), article['headline'].strip(), '\t'.join(articleSentenceList)]
                self._resultQueue.put(lineList)

    def processCitationBlock(self):
        """Consume article batches and emit one row per qualifying sentence.

        A sentence qualifies when it contains a quoted part longer than
        five words and at least one cite word. Each row is then completed
        with the NER-derived actor/organization, engager words, an FCEO
        flag and a brokerage flag before being put on the result queue.
        ``task_done`` is called once per item taken from the task queue.
        """
        #because list is too long, we need to separate name in to chunk
        brokerNameList = list(self._db.getAllBrokerageEffectiveNameList())
        brokerageNamePatternList = []
        # One compiled alternation per chunk of 500 names.
        # NOTE(review): names are inserted into the regex unescaped.
        for i in range(0, len(brokerNameList), 500):
            brokerageNamePatternList.append(re.compile(r'|'.join([r'\b' + name + r'\b' for name in brokerNameList[i : i + 500]]), re.IGNORECASE))
        quotePattern = re.compile(r'\"[^\"]+\"')
        citeWordPatternStringList = [(r'\b' + citeWord + r'\b') for citeWord in self._citeWordList]
        companyCEODict = self._db.getAllCompanyCEODict()
        engagerNamePattern = re.compile(r'|'.join(['CEO', 'analyst', 'executive']), re.IGNORECASE)
        citeWordPattern = re.compile(r'|'.join(citeWordPatternStringList), re.IGNORECASE)
        wordMatchPatternList = [getWordRegexPattern(WORD_CAUSE_IN), getWordRegexPattern(WORD_CAUSE_EX), getWordRegexPattern(WORD_CONTROL_LOW), getWordRegexPattern(WORD_CONTROL_HIGH), getWordRegexPattern(MCD_POS), getWordRegexPattern(MCD_NEG), getWordRegexPattern(MCD_UNCERTAIN)]
        filterWordDict = getWordDict(WORD_FILTER)
        while True:
            #process in batch
            articleBatch = self._taskQueue.get()
            if articleBatch == END_OF_QUEUE:
                self._taskQueue.task_done()
                break
            else:
                lineListBatch = []
                toProcessSentenceBatch = []
                # Column positions inside each row built below: 9 article
                # fields, then sentence (9), quoted text (10), cite words
                # (11), five NER-related slots (12-16), word count (17),
                # then the per-pattern counts followed by the matched words.
                sentenceTextIndex, NERStartIndex, NERPartCount, wordMatchStartIndex = 9, 12, 5, 18
                #add byline_cleaned in articleDict
                self.processBylineInBatch(articleBatch)
                for article in articleBatch:
                    self._db.setArticleProcessed(article['_id'])
                    articlePathPartList = article['filePath'].split('/')
                    articleCompanyCode = articlePathPartList[-3] if articlePathPartList[-2] == 'a' else articlePathPartList[-2]
                    articleCompany = self._db.getCompanyByCode(articleCompanyCode)
                    articleCompanyName = articleCompanyCode if articleCompany is None else articleCompany['name']
                    articleLineListPart = [articleCompanyCode, articleCompanyName, article['filePath'], article['_id'], article['date'], article['sourceName'].strip(), article['byline'].strip(), article['byline_cleaned'], article['headline'].strip()]
                    for paragraph in [article['leadParagraph'], article['tailParagraph']]:
                        #if found qouted part in this paragraph
                        quotedStringList = re.findall(quotePattern, paragraph)
                        if quotedStringList and max([len(string.split()) for string in quotedStringList]) > 5:
                            #Among all the quoted parts, the max word count MUST bigger than 5
                            #If so, then get all sentences
                            sentenceList = sent_tokenize(paragraph)
                            for sentence in sentenceList:
                                quotedStringList = re.findall(quotePattern, sentence)
                                citeWordList = re.findall(citeWordPattern, sentence)
                                #If this sentence has quotation and quoted part word cout is bigger than 5 and has cite word
                                #Then parse it add to the export
                                if quotedStringList and max([len(string.split()) for string in quotedStringList]) > 5 and citeWordList:
                                    lineList = articleLineListPart + [sentence, '. '.join(quotedStringList), ', '.join(citeWordList)] + [''] * NERPartCount + [len(sentence.split())] + [''] * len(wordMatchPatternList) * 2
                                    # Macth the keyword in dictionary
                                    for i, pattern in enumerate(wordMatchPatternList):
                                        matchedWordList = getMatchWordListFromPattern(sentence, pattern, filterWordDict)
                                        lineList[i + wordMatchStartIndex] = len(matchedWordList)
                                        lineList[i + len(wordMatchPatternList) + wordMatchStartIndex] = ', '.join(matchedWordList)
                                    lineListBatch.append(lineList)
                                    toProcessSentenceBatch.append(sentence)
                # One NER call for all collected sentences; results are
                # expected to line up index-for-index with lineListBatch.
                actorAndOrgListBatch = self.processCiteSentenceInBatch(toProcessSentenceBatch)
                for i, actorAndOrgList in enumerate(actorAndOrgListBatch):
                    # NOTE(review): nesting reconstructed from collapsed
                    # source -- rows without any person/organization appear
                    # to be dropped here; confirm against the original file.
                    if actorAndOrgList is not None:
                        engagerNameList = re.findall(engagerNamePattern, lineListBatch[i][sentenceTextIndex])
                        FCEO = 0
                        articleCompanyCode = lineListBatch[i][0]
                        # FCEO becomes 1 when any word of any actor name is a
                        # substring of the CEO entry stored for the company.
                        for name in actorAndOrgList[0].split(', '):
                            for namePart in name.split():
                                if articleCompanyCode in companyCEODict and companyCEODict[articleCompanyCode].find(namePart) != -1:
                                    FCEO = 1
                        lineListBatch[i][NERStartIndex] = actorAndOrgList[0]
                        lineListBatch[i][NERStartIndex + 1] = actorAndOrgList[1]
                        lineListBatch[i][NERStartIndex + 2] = ' '.join(engagerNameList)
                        lineListBatch[i][NERStartIndex + 3] = FCEO
                        # Brokerage names are only searched outside quotes.
                        unQuotedPart = re.sub(r'"[^"]+"', '', lineListBatch[i][sentenceTextIndex])
                        findBrokerage = False
                        for pattern in brokerageNamePatternList:
                            result = pattern.search(unQuotedPart)
                            # The match must start with an upper-case letter.
                            if result is not None and result.string[result.regs[0][0]].isupper():
                                findBrokerage = True
                                break
                        lineListBatch[i][NERStartIndex + 4] = 1 if findBrokerage else 0
                        self._resultQueue.put(lineListBatch[i])
                self._taskQueue.task_done()

    def getNERTaggedTupleListFromSentence(self, sentence):
        """Run the external ``senna`` tagger on *sentence* and parse its output.

        The text is written to ``temp/input.txt``, tagged via a shell
        command into ``temp/output.txt`` and read back.

        Returns:
            A list with one list per non-blank output line; each field is
            stripped, and the second field keeps only the part after its
            last ``-``.
        """
        #use senna name entity tagger, it fast!!
        sentence = unicode(sentence).encode('utf-8', 'ignore')
        # NOTE(review): fixed temp file names -- concurrent workers would
        # share these two files.
        with open('temp/input.txt', 'w') as f:
            f.write(sentence)
        os.system('./senna/senna -path senna/ -ner <temp/input.txt> temp/output.txt')
        with open('temp/output.txt', 'r') as f:
            tagTupleList = [[word.strip().split('-')[-1] if i ==1 else word.strip() for i, word in enumerate(line.split())] for line in f.readlines() if line.split()]
        return tagTupleList

    def processBylineInBatch(self, articleBatch):
        """Set ``byline_cleaned`` on every article of *articleBatch* in place.

        All bylines are joined with a ``******`` separator and tagged in a
        single NER call; consecutive tokens tagged ``PER`` are merged into
        person names, which are comma-joined per article.
        """
        #use '.' to replace '' of byline, because if the last sentence byline is '', it will not be add to concatenated string.
        tagTupleList = self.getNERTaggedTupleListFromSentence(' ****** '.join([article['byline'] if article['byline'] else 'null.' for article in articleBatch]))
        personList, lastTag, wordList = [], '', []
        articleIndex = 0
        for i in range(len(tagTupleList)):
            if tagTupleList[i][1] != lastTag:
                # Tag changed: flush the finished run if it was a person.
                if lastTag == 'PER':
                    personList.append(' '.join(wordList))
                wordList = [tagTupleList[i][0]]
                lastTag = tagTupleList[i][1]
            else:
                wordList.append(tagTupleList[i][0])
            if tagTupleList[i][0].find('****') != -1 or i == len(tagTupleList) - 1:
                #end of one sentence
                # NOTE(review): a PER run that ends on the very last token
                # is not flushed into personList before this assignment.
                articleBatch[articleIndex]['byline_cleaned'] = ', '.join(personList) if personList else ''
                personList, lastTag, wordList = [], '', []
                articleIndex = articleIndex + 1 if i != len(tagTupleList) - 1 else articleIndex
                if articleIndex >= len(articleBatch):
                    return
        # NOTE(review): articleIndex is not advanced on the last token, so
        # this loop also resets the article just filled in above -- confirm.
        while articleIndex < len(articleBatch) :
            articleBatch[articleIndex]['byline_cleaned'] = ''
            articleIndex += 1

    def processCiteSentenceInBatch(self, sentenceBatch):
        """Extract persons and organizations named outside quotes.

        All sentences are joined with a ``******`` separator and tagged in
        a single NER call. Tokens between double-quote tokens are ignored.

        Returns:
            One entry per detected sentence end: ``None`` when neither a
            person nor an organization was collected, otherwise
            ``[persons, organizations]`` as two comma-joined strings.
        """
        tagTupleList = self.getNERTaggedTupleListFromSentence(' ****** '.join(sentenceBatch))
        personAndOrgListBatch = []
        personList, orgnizationList, inQuoteFlag, lastTag, wordList = [], [], False, '', []
        for i in range(len(tagTupleList)):
            if tagTupleList[i][0] == '\"':
                # Toggle the in-quote state on every double-quote token.
                inQuoteFlag = 1 - inQuoteFlag
                if not inQuoteFlag:
                    del wordList[:]
            else:
                if not inQuoteFlag:
                    if tagTupleList[i][1] != lastTag:
                        if lastTag == 'PER':
                            personList.append(' '.join(wordList))
                        elif lastTag == 'ORG':
                            orgnizationList.append(' '.join(wordList))
                        wordList = [tagTupleList[i][0]]
                        lastTag = tagTupleList[i][1]
                    else:
                        wordList.append(tagTupleList[i][0])
            if tagTupleList[i][0].find('****') != -1 or i == len(tagTupleList) - 1:
                #end of one sentence
                if not personList and not orgnizationList:
                    personAndOrgListBatch.append(None)
                else:
                    personAndOrgListBatch.append([', '.join(personList), ', '.join(orgnizationList)])
                personList, orgnizationList, inQuoteFlag, lastTag, wordList = [], [], False, '', []
        return personAndOrgListBatch

    def run(self):
        """Thread entry point: call the routine assigned to ``_executeFunction``."""
        self._executeFunction()
class DataExporter(object):
    """Dump users, groups, challenges and buddy lists from the DB to files."""

    def __init__(self):
        """Open the database controller used by every export method."""
        self.db = DBController()

    def valueToCSVFormat(self, value):
        """Render one value as a CSV field.

        Empty string and None become an empty field; text is UTF-8 encoded
        and wrapped in double quotes; numbers use ``str``; datetimes are
        formatted as ``YYYY-MM-DD``.

        Raises:
            Exception: for any other value type.
        """
        if value == '' or value is None:
            return ''
        elif isinstance(value, str) or isinstance(value, unicode):
            # NOTE(review): embedded double quotes are not escaped.
            return '\"' + value.encode('utf-8', 'ignore') + '\"'
        elif isinstance(value, int) or isinstance(value, float):
            return str(value)
        elif isinstance(value, datetime):
            return datetime.strftime(value, '%Y-%m-%d')
        else:
            raise Exception('Value data type ERROR, must be string, int, float or datetime')

    def exportHistory(self, isDietHistory):
        """Write ``dietHistory.csv`` or ``weightHistory.csv``, one line per user.

        Args:
            isDietHistory: Truthy selects the diet history, falsy the
                weight history.

        A user whose line cannot be built is reported on stdout and skipped.
        """
        if isDietHistory:
            key = 'dietHistory'
            fileName = key + '.csv'
            attributeLine = 'id,name,count,lastUpdateDate,updateTime,food,RDI,fat,protein,carbs,exercise,net\n'
        else:
            key = 'weightHistory'
            fileName = key + '.csv'
            attributeLine = 'id,name,count,lastUpdateDate,lastUpdateWeight,startWeight,goalWeight,updateTime,weight\n'
        with open(fileName, 'w') as f:
            f.write(attributeLine)
            users = self.db.getAllUserList()
            users = sorted(users, key=lambda user : user['id'])
            for user in users:
                lineList = [user['id'], user['name']]
                try:
                    if key in user and user[key] is not None and len(user[key]) > 0:
                        lineList.append(len(user[key]))
                        # First element of the last history entry (the
                        # 'lastUpdateDate' header column).
                        lineList.append(user[key][-1][0])
                        if not isDietHistory:
                            lineList.append(user[key][-1][1])
                            lineList.append(user['startWeight'])
                            lineList.append(user['goalWeight'])
                        # Every history entry is flattened onto the same
                        # line, so line length varies per user.
                        for detailInfoTuple in user[key]:
                            lineList.extend(detailInfoTuple)
                    else:
                        lineList.append(0)
                    line = ','.join([self.valueToCSVFormat(value) for value in lineList]) + '\n'
                    f.write(line)
                except Exception as e:
                    print e, user['id']

    def exportGroupChallenge(self, isGroup):
        """Write ``group.csv`` or ``challenge.csv``: id, name, member count, member ids.

        Args:
            isGroup: Truthy exports groups, falsy exports challenges.
        """
        items = self.db.getAllGroupList() if isGroup else self.db.getAllChallengeList()
        items = sorted(items, key=lambda item : item['id'])
        fileName = 'group.csv' if isGroup else 'challenge.csv'
        with open(fileName, 'w') as f:
            f.write('id,name,count,memberId\n')
            for item in items:
                lineList = [item['id'], item['name']]
                if 'member' in item and item['member'] is not None:
                    lineList.append(len(item['member']))
                    lineList.extend(item['member'])
                else:
                    lineList.append(0)
                line = ','.join([self.valueToCSVFormat(value) for value in lineList]) + '\n'
                f.write(line)

    # Earlier variant that wrote one text file per user, kept commented out:
    # def exportUserGroupChallenge(self, isGroup):
    #     users = self.db.getAllUserIter()
    #     directory = 'userGroup/' if isGroup else 'userChallenge/'
    #     key = 'group' if isGroup else 'challenge'
    #     if not os.path.exists(directory):
    #         os.mkdir(directory)
    #     for user in users:
    #         fileName = directory + str(user['id']) + '.txt'
    #         if key in user and user[key] is not None:
    #             with open(fileName, 'w') as f:
    #                 itemList = sorted(user[key])
    #                 for itemId in itemList:
    #                     line = str(itemId) + '\n'
    #                     f.write(line)
    #         else:
    #             with open(fileName, 'w') as f:
    #                 pass

    def exportUserGroupChallenge(self, isGroup):
        """Write ``userGroup.csv`` or ``userChallenge.csv``: id, count, item ids per user.

        Args:
            isGroup: Truthy exports each user's groups, falsy their
                challenges.
        """
        key = 'group' if isGroup else 'challenge'
        fileName = 'userGroup.csv' if isGroup else 'userChallenge.csv'
        attributeLine = 'id,count,group\n' if isGroup else 'id,count,challenge\n'
        with open(fileName, 'w') as f:
            users = self.db.getAllUserList()
            users = sorted(users, key=lambda user : user['id'])
            f.write(attributeLine)
            for user in users:
                lineList = [user['id']]
                if key in user and user[key] is not None:
                    lineList.append(len(user[key]))
                    lineList.extend(user[key])
                else:
                    lineList.append(0)
                line = ','.join([self.valueToCSVFormat(value) for value in lineList]) + '\n'
                f.write(line)

    def exportBuddy(self):
        """Write ``buddy/<user id>.txt`` for every user.

        Each line is ``<user id> <buddy id>``, buddies in sorted order.
        Users without a buddy list get an empty file.
        """
        users = self.db.getAllUserIter()
        if not os.path.exists('buddy/'):
            os.mkdir('buddy/')
        for user in users:
            fileName = 'buddy/' + str(user['id']) + '.txt'
            if 'buddy' in user and user['buddy'] is not None:
                with open(fileName, 'w') as f:
                    buddyIdList = sorted(user['buddy'])
                    for userId in buddyIdList:
                        line = str(user['id']) + ' ' + str(userId) + '\n'
                        f.write(line)
            else:
                # Opening in 'w' mode creates (or truncates to) an empty file.
                with open(fileName, 'w') as f:
                    pass

    def getUserIdNameDict(self):
        """Return a dict mapping every user's ``id`` to their ``name``."""
        users = self.db.getAllUserIter()
        userIdNameDict = {}
        for user in users:
            userIdNameDict[user['id']] = user['name']
        return userIdNameDict
def loadAllDialoguesFromFile(speakerTypeFilePath, folderPath):
    """Rebuild the dialogue database from a speaker CSV and speech text files.

    The database is dropped first. Speaker names are mapped to ids per
    speaker type from columns 14-16 of the CSV (header row skipped). Every
    ``*txt`` file found in a directory whose name starts with ``chunk`` is
    then parsed from its underscore-separated file name
    (company, time, session type, session order, asker, answerer, ...,
    speaker name + speech order) and stored as conference / session /
    speech documents, creating each level only when it does not exist yet.

    Args:
        speakerTypeFilePath: Path of the speaker-type CSV file.
        folderPath: Root folder walked for speech files.

    A file whose processing raises is reported on stdout and skipped.
    """
    db = DBController()
    db.dropDB()
    ensuredIndex = False
    # speaker name -> speaker id, one dict per speaker type
    ADict, CDict, JDict, DotDict = {}, {}, {}, {}

    #load the speaker type csv file
    with open(speakerTypeFilePath, 'rU') as f:
        lines = csv.reader(f)
        for i, line in enumerate(lines):
            if i == 0:
                continue  # header row
            speakerName, speakerType, speakerId = line[14].strip(), line[15].strip().upper(), line[16].strip()
            if speakerType == TYPE_ANALYST:
                ADict[speakerName] = speakerId
            elif speakerType == TYPE_CEO:
                CDict[speakerName] = speakerId
            elif speakerType == TYPE_JOURNALIST:
                JDict[speakerName] = speakerId
            elif speakerType == TYPE_DOT:
                DotDict[speakerName] = speakerId
            else:
                # Unknown type: report it and leave the speaker unmapped.
                print(speakerName, speakerType)

    for dirPath, dirNames, fileNames in os.walk(folderPath):
        print(dirPath)
        if os.path.split(dirPath)[-1].startswith('chunk'):
            for fileName in fileNames:
                try:
                    if fileName.endswith('txt'):
                        fileNameParts = [part.strip() for part in fileName.split('.txt')[0].split('_')]
                        company, time = fileNameParts[0], fileNameParts[1]
                        sessionType, sessionOrder, asker, answerer = fileNameParts[2], int(fileNameParts[3]), fileNameParts[4], fileNameParts[5]

                        # The last part is "<speaker name><1-2 digit order>".
                        if fileNameParts[-1].endswith('default') or fileNameParts[-1].endswith('copy'):
                            continue
                        elif fileNameParts[-1][-1].isdigit() and not fileNameParts[-1][-2].isdigit():
                            speakerName = fileNameParts[-1][:-1].strip()
                            speechOrder = int(fileNameParts[-1][-1:])
                        elif fileNameParts[-1][-1].isdigit() and fileNameParts[-1][-2].isdigit():
                            speakerName = fileNameParts[-1][:-2].strip()
                            speechOrder = int(fileNameParts[-1][-2:])
                        else:
                            continue

                        conference = db.getConferenceByCompanyTime(company, time)
                        if conference is None:
                            conference = {'company' : company, 'time' : time}
                            conference = db.insertConference(conference)

                        session = db.getSessionByConferenceAndOrder(conference['_id'], sessionOrder)
                        if session is None:
                            # Store the session under the same order it is
                            # looked up by. The speech order was stored here
                            # before, so later lookups by sessionOrder missed
                            # the session and inserted duplicates.
                            session = {'conference' : conference['_id'], 'order' : sessionOrder, 'type' : sessionType, 'asker' : asker, 'answerer' : answerer}
                            session = db.insertSession(session)

                        speech = db.getSpeechByConferenceIdAndSessionIdAndOrder(conference['_id'], session['_id'], speechOrder)
                        if speech is None:
                            if speakerName in ADict:
                                speakerType, speakerId = TYPE_ANALYST, ADict[speakerName]
                            elif speakerName in CDict:
                                speakerType, speakerId = TYPE_CEO, CDict[speakerName]
                            elif speakerName in JDict:
                                speakerType, speakerId = TYPE_JOURNALIST, JDict[speakerName]
                            elif speakerName in DotDict:
                                speakerType, speakerId = TYPE_DOT, DotDict[speakerName]
                            else:
                                # Unmapped speaker: stored as TYPE_DOT with an
                                # empty id and reported.
                                speakerType, speakerId = TYPE_DOT, ''
                                print(fileName, speakerName)
                            filePath = os.path.join(dirPath, fileName)
                            with open(filePath, 'rU') as f:
                                text = ' '.join(f.readlines()).strip()
                                # Drop every non-ASCII character.
                                text = text.decode('ascii', 'ignore').encode('ascii', 'ignore')
                            speech = {'conference' : conference['_id'], 'session' : session['_id'], 'order' : speechOrder, 'text' : text, 'speakerName' : speakerName, 'speakerType' : speakerType, 'speakerId' : speakerId}
                            db.insertSpeech(speech)
                            # Indexes are created once, after the first insert.
                            if not ensuredIndex:
                                db.ensureIndex()
                                ensuredIndex = True
                except Exception as e:
                    print(fileName)
                    print(e)
def extractFeatureToDB(self, beginWeek, endWeek=None, isReload=False, useAlchemyAPI=False):
    """Compute and store one feature record per song and week.

    For every week from ``beginWeek`` to ``endWeek`` (both snapped with
    ``dateToSaturday``), the songs ranked in the previous week are looked
    up and a feature dict is inserted for each of them.

    Args:
        beginWeek: First week (datetime); must not be before 2007-01-07.
        endWeek: Last week (datetime); must not be in the future. Defaults
            to the time of the call.
        isReload: When falsy, songs that already have a feature record for
            the week are skipped.
        useAlchemyAPI: Forwarded to the review/comment/twitter lookups.

    Raises:
        ValueError: when ``beginWeek`` or ``endWeek`` is out of range.
    """
    # Resolve the default per call. A ``datetime.today()`` default in the
    # signature is evaluated once, when the function is defined, and goes
    # stale in a long-running process.
    if endWeek is None:
        endWeek = datetime.today()
    if beginWeek < datetime(2007, 1, 7) or endWeek > datetime.today():
        raise ValueError('Invalid input date!')

    beginWeek, endWeek = dateToSaturday(beginWeek), dateToSaturday(endWeek)
    # Snapping may land on a Saturday that is still ahead; step back a week.
    endWeek = endWeek - timedelta(days=7) if endWeek > datetime.today() else endWeek

    iterWeek = beginWeek
    db = DBController()
    while iterWeek <= endWeek:
        # Chart ranks are taken from the week before the target week.
        lastWeek = iterWeek - timedelta(days=7)
        songRankList = db.getSongIdListByWeek(lastWeek)
        for songId in songRankList:
            if not isReload and db.isFeatureInDB(iterWeek, songId):
                continue
            featureDict = {}
            featureDict['id'] = songId
            featureDict['week'] = iterWeek
            featureDict['sales'] = db.getSalesRank(lastWeek, songId)
            featureDict['radio'] = db.getRadioRank(lastWeek, songId)
            featureDict['streaming'] = db.getStreamingRank(lastWeek, songId)
            featureDict['MVView'], featureDict['MVSocialInteraction'] = db.getIMVDBData(iterWeek, songId)
            featureDict['MTVReviewCount'], featureDict['MTVReviewScore'] = db.getMTVReviewData(iterWeek, songId, useAlchemyAPI)
            featureDict['youtubeCommentCount'], featureDict['youtubeCommentScore'] = db.getYoutubeData(iterWeek, songId, useAlchemyAPI)
            featureDict['twitterCount'], featureDict['twitterScore'] = db.getTwitterData(iterWeek, songId, useAlchemyAPI)
            featureDict['rank'] = db.getTop50Rank(iterWeek, songId)
            db.insertFeatureToDB(featureDict)
        iterWeek += timedelta(days=7)
class Crawler:
    """Crawl Google Play chart pages and download matching APKs from APKPure.

    Settings (APK folder, chrome driver path, DB path, category URLs) are
    read from ``config.ini`` in the working directory.
    """

    def __init__(self, is_desktop):
        """Read the configuration and start the browser and the database.

        Args:
            is_desktop: When falsy, a virtual display is started before the
                browser is launched.
        """
        config = configparser.ConfigParser()
        config.read('config.ini')
        self.apk_directory = config.get('Setting', 'APK_DIRECTORY')
        self.is_desktop = is_desktop
        if not is_desktop:
            self.display = Display(visible=0, size=(800, 600))
            self.display.start()
        self.chrome = webdriver.Chrome(
            config.get('Setting', 'CHROME_DRIVER_DIRECTORY'))
        # List of (category name, chart URL) pairs.
        self.category_list = config.items('PlayStoreURL')
        self.db_connector = DBController(config.get('Setting', 'DB_DIRECTORY'))
        self.db_connector.create_table()

    def __get_new_app_list(self, popular_url):
        """Return the package names listed on a category's popular chart page."""
        self.chrome.get(popular_url)
        self.chrome.implicitly_wait(10)

        # The lower-ranked apps only appear after scrolling the page.
        for scroll in (10000, 20000, 30000, 40000, 50000):
            self.chrome.execute_script("window.scrollTo(0," + str(scroll) + ");")
            time.sleep(2)

        package_list = []
        # One card div per app.
        div_app_list = self.chrome.find_elements_by_css_selector(
            ".card.no-rationale.square-cover.apps.small")
        for div_app in div_app_list:
            app_detail = div_app.find_element_by_class_name('details')
            url = app_detail.find_element_by_class_name('title')\
                .get_attribute('href')
            package_list.append(url.split('id=')[1])
        return package_list

    def __get_app_detail(self, package_list):
        """Crawl name, image source and update date for each package.

        Returns:
            A list of ``[name, package, img_src, updated_date, False]``
            rows; packages whose detail page could not be parsed are
            reported and left out.
        """
        # Base URL of an app's detail page; the package name is appended.
        base_url = 'https://play.google.com/store/apps/details?id='
        detail_list = []
        for package in package_list:
            self.chrome.get(base_url + package)
            self.chrome.implicitly_wait(10)
            try:
                name = self.chrome.\
                    find_element_by_css_selector('.id-app-title').text
                img_src = self.chrome.\
                    find_element_by_css_selector('.cover-image').\
                    get_attribute('src')
                updated_date = self.chrome.\
                    find_elements_by_css_selector('.content')[0].text
                # The rating count is not stored, but a page without one
                # raises here and the package is skipped, as before.
                ratings = self.chrome.find_elements_by_css_selector(
                    '.rating-count')[0].text
                if ',' in ratings:
                    ratings = ratings.replace(',', '')
            except Exception:
                # Narrowed from a bare ``except:`` so Ctrl-C still stops
                # the crawl.
                print(package + " 오류 발생")
                print(package + " name, img_src, update_date 가져오기 실패")
                continue
            # The trailing False fills the isDownloaded column.
            detail_list.append([name, package, img_src, updated_date, False])
        return detail_list

    def __download_apk(self, package_name, download_url):
        """Download an APK over HTTP into ``self.apk_directory``.

        Returns:
            True when the file was saved, False when the request timed out,
            returned an HTTP error status, or anything else failed.
        """
        file_name = str(package_name) + '.apk'
        try:
            r = requests.get(download_url, timeout=60)
            # Do not save an HTTP error page as if it were the APK.
            r.raise_for_status()
            with open(self.apk_directory + file_name, 'wb') as apk:
                apk.write(r.content)
        except requests.exceptions.Timeout:
            print('time out')
            return False
        except Exception as e:
            print(e)
            return False
        return True

    def crawl_new(self):
        """Crawl every category's chart and store the apps found there."""
        for category in self.category_list:
            category_name = category[0]
            url = category[1]
            # Package names currently on the chart.
            new_package_list = self.__get_new_app_list(url)
            # Name, update date and image source for each of them.
            updated_app_list = self.__get_app_detail(new_package_list)
            # Write the fresh rows to the database.
            self.db_connector.update_app(updated_app_list, category_name)
        self.db_connector.commit_n_close()

    def crawl_old(self):
        """Refresh the details of the apps already stored for each category."""
        for category in self.category_list:
            category_name = category[0]
            # Packages already stored for this category.
            # NOTE(review): the whole (name, url) pair is passed here, not
            # category_name -- confirm what the controller expects.
            old_package_list = self.db_connector\
                .get_old_category_app_list(category)
            updated_app_list = self.__get_app_detail(old_package_list)
            self.db_connector.update_app(updated_app_list, category_name)
        self.db_connector.commit_n_close()

    def update_apk(self):
        """Download the APK of every package the DB lists as not updated."""
        not_updated_list = self.db_connector.not_updated_list()
        print(1)
        for package_row in not_updated_list:
            package_name = package_row[0]
            search_url = 'http://apkpure.com/search?q=' + package_name
            self.chrome.get(search_url)
            self.chrome.implicitly_wait(10)

            # Search by package name and look for the matching app.
            search_titles = self.chrome.\
                find_elements_by_class_name('search-title')
            # Skip packages APKPure has no results for.
            if len(search_titles) == 0:
                logging.info(package_name + " is not searched")
                continue

            # The matching result has the package name in its href. Only a
            # matching href is kept; before, the last result was followed
            # even when nothing matched.
            link = ''
            for title in search_titles:
                candidate = title.find_element_by_tag_name('a')\
                    .get_attribute('href')
                if candidate and package_name in candidate:
                    link = candidate
                    break
            # Results were found but none of them matches: skip.
            if link == '':
                logging.info(package_name + ' is not searched in APKpure')
                continue
            print(link)  # debug

            self.chrome.get(link)
            self.chrome.implicitly_wait(10)
            a_list = self.chrome.find_elements_by_class_name(' down')
            try:
                for a in a_list:
                    download_page = a.get_attribute('href')
                    # Found it when the href contains the package name.
                    if package_name in download_page:
                        self.chrome.get(download_page)
                        self.chrome.implicitly_wait(10)
                        break
                # The download iframe is sometimes missing; such packages
                # are skipped.
                iframe = self.chrome.find_element_by_id('iframe_download')
                src = iframe.get_attribute('src')
            except Exception:
                logging.info(package_name + " does not have href or iframe")
                continue

            downloaded = self.__download_apk(package_name, src)
            self.db_connector.update_isdownload(package_name, downloaded)
        self.db_connector.commit_n_close()

    def close(self):
        """Shut down the browser and, when one was started, the virtual display."""
        # WebDriver has no stop(); quit() ends the session and the driver.
        self.chrome.quit()
        if not self.is_desktop:
            self.display.stop()
def __init__(self):
    """Create the result/task queues, the DB controller and worker bookkeeping."""
    self._resultQueue, self._taskQueue = Queue(), Queue()
    self._db = DBController()
    # A single worker by default; the thread list is filled on demand.
    self._threadNumber, self._threadList = 1, []
def __init__(self):
    """Open the DB controller and build a KMeans clusterer with TOTAL_CLUSTER clusters."""
    self.db = DBController()
    kmeans = KMeans(n_clusters=TOTAL_CLUSTER)
    self.clusterer = kmeans
def __init__(self):
    """Open the DB controller, then keep the result of ``self.login()`` as ``br``."""
    controller = DBController()
    self.db = controller
    # login() runs after self.db is set.
    self.br = self.login()
def __iter__(self):
    """Yield a ``BaseArticle`` for each id in ``self.articles_id``, fetched lazily."""
    for article_id in self.articles_id:
        record = DBController.get_article(article_id)
        yield BaseArticle(record)
while commentFeed is not None: for comment in commentFeed.entry: commentText = comment.content.text commentDate = dateStringToSaturday(comment.updated.text) commentList.append({'week' : commentDate, 'comment' : commentText}) next_link = commentFeed.GetNextLink() if next_link is None: commentFeed = None else: commentFeed = self.client.GetYouTubeVideoCommentFeed(next_link.href) except Exception, e: print e return commentList def extractYoutubeCommentsToDB(self, songList): db = DBController() for song in songList: try: searchVideoName = song['title'] + ' ' + song['artist'] videoID = self.getVideoID(searchVideoName) comments = self.getComments(videoID) db.insertCommentToDB(song['id'], comments) except Exception as e: print e continue if __name__ == '__main__': extractor = YoutubeCommentsExtractor() db = DBController() songList = db.getSongByWeek(lastSaturday()) extractor.extractYoutubeCommentsToDB(songList)
from Storage import Storage


def check_db_status():
    """Seed the database when it contains no articles."""
    # Stop at the first article rather than walking the whole collection
    # just to learn whether it is empty.
    has_articles = any(True for _ in DBController.get_all_articles())
    if not has_articles:
        print('Seeding database...')
        DatabaseSeeder.seed()


if __name__ == "__main__":
    check_db_status()
    articles = DBController.get_all_articles(limit=None)
    testing_sample = TrainingSample(articles)

    # Only the LSI model is trained and saved; the other models are
    # switched off.
    lsi = LsiModel(model_name='phyge')
    # lda = LdaModel(model_name='phyge')
    # d2v = D2vModel(model_name='phyge')
    # fast_text = FastTextModel(model_name='phyge')

    lsi.train_model(testing_sample)
    # lda.train_model(testing_sample)
    # d2v.train_model(testing_sample)
    # fast_text.train_model(testing_sample)

    Storage.save_model(lsi, path='out/lsi')
    # Storage.save_model(lda, path='out/lda')
def loadEngagerAndCompanyToDB(filePath):
    """Load CEOs and companies from a CSV file into the database.

    The first row is skipped. For every other row the engager named in
    column 5 is inserted when missing; the company named in column 3 is
    either inserted with a one-entry ``CEO`` mapping (column 1 -> engager
    id) or has that mapping extended. Finally three generic engagers
    ('CEO', 'Executive', 'analyst') are inserted.

    Args:
        filePath: Path of the CSV file.
    """
    with open(filePath, 'rU') as f:
        db = DBController()
        for rowIndex, rawRow in enumerate(csv.reader(f)):
            if rowIndex == 0:
                continue
            row = [cell.strip() for cell in rawRow]

            # Make sure the engager exists before it is referenced.
            if db.getEngagerByName(row[5]) is None:
                db.insertEngager({'name' : row[5], 'lastName' : row[6], 'type' : ENGAGER_CEO, 'gender' : row[-1]})

            if db.getCompanyByName(row[3]) is None:
                # New company: starts with a single CEO entry.
                ceo = db.getEngagerByName(row[5])
                db.insertCompany({'_id' : int(row[2]), 'name' : row[3], 'shortName' : row[4], 'code' : row[0], 'CEO' : {row[1] : ceo['_id']}})
                continue

            # Known company: add this row's entry to its CEO mapping.
            ceo = db.getEngagerByName(row[5])
            company = db.getCompanyByName(row[3])
            ceoMapping = company['CEO']
            ceoMapping[row[1]] = ceo['_id']
            db.updateCompanyCEO(company['_id'], ceoMapping)

        # Generic engagers, inserted once after all rows.
        for name, engagerType in [('CEO', ENGAGER_CEO), ('Executive', ENGAGER_CEO), ('analyst', ENGAGER_ANALYST)]:
            db.insertEngager({'name' : name, 'lastName' : name, 'type' : engagerType})
def __init__(self):
    """Create the private DB controller with arguments 'localhost' and 27017."""
    host, port = 'localhost', 27017
    self.__dbController = DBController(host, port)
def seed(cls):
    """Run the controller's first-time setup, then seed the web articles."""
    DBController.first_setup()
    seed_web_articles = cls.__seed_web_articles
    seed_web_articles()
wrong_ids[model_name].append( (true_sourse, found_sourse)) output_answer.append( dict(true_sourse=true_sourse, sourse=found_sourse, model=model_name, title=answer[model_name][0]['title'], similarity=answer[model_name][0]['similarity'])) file.write(json.dumps(output_answer, indent=2, ensure_ascii=False)) file.write(json.dumps(wrong_ids, indent=2, ensure_ascii=False)) if __name__ == "__main__": log_of_result = [] if len(DBController.get_all_documents()) == 0: print('Seeding database...') DatabaseSeeder.seed() lsi = Storage.load_model('out/lsi', 'phyge', 'lsi') lda = Storage.load_model('out/lda', 'phyge', 'lda') d2v = Storage.load_model('out/d2v', 'phyge', 'd2v') fast_text = Storage.load_model('out/fast_text', 'phyge', 'ft') search_engine = SearchEngine(models=[fast_text, d2v, lda, lsi]) test_path = os.path.join(PhyVariables.testsDir, 'test_' + str(PhyVariables.queriesId)) run_search(os.path.join(test_path, PhyVariables.queriesFileName), os.path.join(test_path, PhyVariables.answersFileName), 1) # run_search('Resources/pdf_articles.json','Resources/answers.json',1)
# Author: jun10000 (https://github.com/jun10000) # import os import signal import sys import time from collections import deque from RPi import GPIO import settings import JunLib import wave_signal from DBController import DBController db_controller = DBController() def initialise(): GPIO.setmode(GPIO.BCM) GPIO.setup(settings.INPUT_PIN, GPIO.IN) global db_controller db_controller.connect() buffer = deque() # noinspection PyUnusedLocal def loop(signal_num, frame):
class DataExporterMaster():
    """Drives CSV exports by feeding work to processor threads.

    Work items are put on a task queue consumed by ``DataProcessorThread``
    workers; the workers share a result queue with a ``CSVWriterThread``.
    ``END_OF_QUEUE`` is put on a queue to signal that no more items follow.
    """

    def __init__(self):
        """Create the queues, the DB controller and the thread bookkeeping."""
        self._resultQueue = Queue()
        self._taskQueue = Queue()
        self._db = DBController()
        self._threadNumber = 1
        self._threadList = []

    def exportAllCitationBlock(self):
        """Export citation data for all articles to 'export/allCitationSentence.csv'.

        All articles are first marked unprocessed, then unprocessed articles
        are fetched in batches and handed to the worker threads until a fetch
        returns an empty batch.
        """
        #single thread is enough
        attributeList = ['cotic', 'coname', 'filePath', 'accessNo', 'date', 'source', 'byline', 'byline_cleaned', 'headline', 'sentence', 'cite_content', 'cite_word', 'actor', 'organization', 'engager', 'FCEO', 'broker']
        attributeList += ['total_word_count', 'cau_int', 'cau_ext', 'cont_l', 'cont_h', 'pos', 'neg', 'uncert']
        attributeList += ['cau_int_words', 'cau_ext_words', 'cont_l_words', 'cont_h_words', 'pos_words', 'neg_words', 'uncert_words']

        #Comment this line if you wanna continue last time work and set the write mode to append 'a'
        self._db.setAllArticleUnprocessed()
        writer = CSVWriterThread(self._resultQueue, 'export/allCitationSentence.csv', attributeList, mode='w')
        writer.start()

        #must set to 100, otherwise there's bug
        batchSize = 100
        for i in range(self._threadNumber):
            t = DataProcessorThread(self._taskQueue, self._resultQueue)
            t._executeFunction = t.processCitationBlock
            t.start()
            self._threadList.append(t)

        while True:
            isDone = False
            # Queue one batch per worker, then wait for the whole round to be
            # processed before fetching again.
            for i in range(self._threadNumber):
                articleBatch = list(self._db.getUnprocessedArticleInBatch(batchSize))
                # An empty batch means nothing is left to process.
                if articleBatch is None or not articleBatch:
                    isDone = True
                    break
                self._taskQueue.put(articleBatch)
            self._taskQueue.join()
            print('################')
            if isDone:
                break

        # Shut down: one end marker per worker, then the writer.
        for i in range(self._threadNumber):
            self._taskQueue.put(END_OF_QUEUE)
        self._taskQueue.join()
        for t in self._threadList:
            t.join()
        self._resultQueue.put(END_OF_QUEUE)
        self._resultQueue.join()
        writer.join()

    def exportKeywordSearch(self, searchString):
        """Export articles matching ``searchString`` to 'export/keywordSearch.csv'.

        Uses four worker threads; each matching article is queued as its own
        task.
        """
        # NOTE(review): this overwrites self._threadNumber for the rest of the
        # instance's life and self._threadList is never cleared, so workers
        # from an earlier export on the same instance are joined again here.
        self._threadNumber = 4
        attributeList = ['cotic', 'coname', 'filePath', 'accessNo', 'date', 'source', 'byline', 'headline', 'sentence']
        writer = CSVWriterThread(self._resultQueue, 'export/keywordSearch.csv', attributeList)
        writer.start()

        for i in range(self._threadNumber):
            t = DataProcessorThread(self._taskQueue, self._resultQueue, searchString)
            t._executeFunction = t.processKeywordSearch
            t.start()
            self._threadList.append(t)

        articleListCursor = self._db.getAllArticleBySearchString(searchString)
        #it's cursor here!!
        for article in articleListCursor:
            self._taskQueue.put(article)

        # Shut down: one end marker per worker, then the writer.
        for i in range(self._threadNumber):
            self._taskQueue.put(END_OF_QUEUE)
        for t in self._threadList:
            t.join()
        self._resultQueue.put(END_OF_QUEUE)
        writer.join()
class SignifierParser(object):
    """Extracts sentences from articles and tags them using word lists.

    Sentences mentioning a known engager or company are stored in the
    database and later annotated with citation, performance (pfm),
    positive/negative and attribution (internal/external) words.
    """

    # Engagers with one of these last names are matched by their 'name' field
    # instead of their last name (presumably because the last name alone is
    # too ambiguous in running text -- TODO confirm).
    _MATCH_BY_NAME_LAST_NAMES = frozenset(['Jones', 'Johnson', 'West', 'Post', 'Ford'])

    def __init__(self):
        """Load the word lists, all engagers and companies, and their regex patterns."""
        self.db = DBController()
        self.pfmWord = getWordList(WORD_PFM)
        self.posWord = getWordList(WORD_POS)
        self.negWord = getWordList(WORD_NEG)
        self.exWord = getWordList(ATRB_EX)
        self.inWord = getWordList(ATRB_IN)
        self.citeWord = getWordList(WORD_CITE)
        self.engagerList = list(self.db.getAllEngager())
        self.companyList = list(self.db.getAllCompany())
        self.engagerRegexPatternDict, self.companyRegexPatternDict = self.getRegexPatternDictForEngagerAndCompany()

    def getRegexPatternDictForEngagerAndCompany(self):
        """Build whole-word regex patterns keyed by engager / company ``_id``.

        Engager patterns are case-sensitive; company patterns (built from
        'shortName') ignore case.

        :return: tuple ``(engagerRegexPatternDict, companyRegexPatternDict)``.
        """
        engagerRegexPatternDict, companyRegexPatternDict = {}, {}
        for engager in self.engagerList:
            if engager['lastName'] in self._MATCH_BY_NAME_LAST_NAMES:
                searchName = engager['name']
            else:
                searchName = engager['lastName']
            engagerRegexPatternDict[engager['_id']] = re.compile(r'\b' + searchName + r'\b')
        for company in self.companyList:
            companyRegexPatternDict[company['_id']] = re.compile(r'\b' + company['shortName'] + r'\b', re.IGNORECASE)
        return engagerRegexPatternDict, companyRegexPatternDict

    def extractAllSentenceToDB(self, isReload=False):
        """Tokenize every company's articles and store the relevant sentences.

        :param isReload: when true, existing sentences are dropped first.
        """
        if isReload:
            self.db.dropSentence()
        for i, company in enumerate(self.companyList):
            articles = list(self.db.getAllArticleByCompanyCode(company['code']))
            engagers = list(self.db.getAllEngagerByCompanyId(company['_id']))
            for j, article in enumerate(articles):
                print(i, j)
                paragraphSet = ('leadParagraph', 'tailParagraph')
                for key in paragraphSet:
                    paragraph = article[key]
                    sentenceList = sent_tokenize(paragraph)
                    for string in sentenceList:
                        if not isValidSentence(string):
                            continue
                        # NOTE(review): content is stored UTF-8 encoded. On
                        # Python 3 that is bytes, which the str patterns used
                        # by parseRawSentence cannot search -- confirm the
                        # target interpreter.
                        sentenceDict = {'content' : string.encode('utf-8'), 'articleId' : article['_id'], 'paragraph' : key}
                        sentenceDict = self.parseRawSentence(sentenceDict, engagers)
                        if sentenceDict is not None:
                            self.db.insertSentence(sentenceDict)

    def parseRawSentence(self, sentenceDict, engagerList):
        """Attach the ids of engagers and companies mentioned in the sentence.

        :param sentenceDict: dict with at least a 'content' entry.
        :param engagerList: engagers to look for (companies are always taken
            from ``self.companyList``).
        :return: ``sentenceDict`` with 'engager' and 'company' id lists added,
            or ``None`` when neither an engager nor a company is mentioned.
        """
        content = sentenceDict['content']
        engagerIdList = [engager['_id'] for engager in engagerList
                         if self.engagerRegexPatternDict[engager['_id']].search(content) is not None]
        companyIdList = [company['_id'] for company in self.companyList
                         if self.companyRegexPatternDict[company['_id']].search(content) is not None]
        if not engagerIdList and not companyIdList:
            return None
        sentenceDict['engager'] = list(set(engagerIdList))
        sentenceDict['company'] = list(set(companyIdList))
        return sentenceDict

    def parseAllSentenceCitation(self):
        """Tag every stored sentence with its citation words and cite flags."""
        sentences = list(self.db.getAllSentence())
        for i, sentence in enumerate(sentences):
            print(i)
            words = getProcessedWordList(sentence['content'], VERB)
            # A real list (not a lazy filter object) is required: it is
            # truth-tested, iterated and saved to the database.
            sentence['cite'] = [word for word in words if word in self.citeWord]
            sentence['citeCEO'], sentence['citeAnalyst'], sentence['citeCompany'] = self.isCiteInDistance(sentence)
            self.db.saveSentence(sentence)

    def parseAllSentencePfm(self):
        """Tag every stored sentence with performance and nearby pos/neg words."""
        # Materialize the cursor first: updating the sentences a live cursor
        # points at while iterating it would corrupt the iteration.
        sentences = list(self.db.getAllSentence())
        for i, sentence in enumerate(sentences):
            print(i)
            pfmSentenceWordList = getProcessedWordList(sentence['content'], NOUN)
            pfmWordList = [word for word in pfmSentenceWordList if word in self.pfmWord]
            posNegSentenceWordList = getProcessedWordList(sentence['content'], VERB)
            # Lists, not filter objects: both are iterated once per
            # performance word in filterPosNegWordListByDistance.
            posWordList = [word for word in posNegSentenceWordList if word in self.posWord]
            negWordList = [word for word in posNegSentenceWordList if word in self.negWord]
            posWordList, negWordList = self.filterPosNegWordListByDistance(pfmSentenceWordList, posNegSentenceWordList, pfmWordList, posWordList, negWordList)
            self.db.updateSentencePfm(sentence['_id'], pfmWordList, posWordList, negWordList)

    def parseAllSentenceAtrb(self):
        """Tag every stored sentence with external / internal attribution words."""
        sentences = list(self.db.getAllSentence())
        for i, sentence in enumerate(sentences):
            print(i)
            words = getProcessedWordList(sentence['content'], NOUN)
            exWordList = [word for word in words if word in self.exWord]
            inWordList = [word for word in words if word in self.inWord]
            # In a citing sentence, 'ceo'/'executive' invalidates the internal
            # attribution words altogether.
            if ('ceo' in inWordList or 'executive' in inWordList) and sentence['cite']:
                inWordList = []
            self.db.updateSentenceAtrb(sentence['_id'], exWordList, inWordList)

    def isCiteInDistance(self, sentence):
        """Report whether a citation word occurs close to an engager or company.

        A match requires the citation word and the engager's last name (or the
        company's short name), both lower-cased, to be at most CITE_DISTANCE
        positions apart in the processed word list.

        :return: tuple ``(isCiteCEO, isCiteAnalyst, isCiteCompany)``.
        """
        isCiteCEO, isCiteAnalyst, isCiteCompany = False, False, False
        if sentence['cite']:
            wordList = getProcessedWordList(sentence['content'], VERB)
            for citeWord in sentence['cite']:
                citeIndex = wordList.index(citeWord)
                for engagerId in sentence['engager']:
                    try:
                        engager = self.db.getEngagerById(engagerId)
                        matchName = engager['lastName'].lower()
                        engagerIndex = wordList.index(matchName)
                        if abs(citeIndex - engagerIndex) <= CITE_DISTANCE:
                            if engager['type'] == ENGAGER_CEO:
                                isCiteCEO = True
                            else:
                                isCiteAnalyst = True
                    except Exception:
                        # Best effort: the name may be absent from the word
                        # list or the lookup may fail; skip this engager.
                        pass
                for companyId in sentence['company']:
                    try:
                        company = self.db.getCompanyById(companyId)
                        matchName = company['shortName'].lower()
                        companyIndex = wordList.index(matchName)
                        if abs(citeIndex - companyIndex) <= CITE_DISTANCE:
                            isCiteCompany = True
                    except Exception:
                        # Best effort, as for engagers above.
                        pass
        return isCiteCEO, isCiteAnalyst, isCiteCompany

    def filterPosNegWordListByDistance(self, pfmSentenceWordList, posNegSentenceWordList, pfmWordList, posWordList, negWordList):
        """Keep the pos/neg words lying within PFM_DISTANCE of a performance word.

        Positions are the first occurrence of each word in its own processed
        word list. A pos/neg word is appended once per performance word it is
        close to, so the returned lists may contain duplicates.

        :return: tuple ``(filteredPosWordList, filteredNegWordList)``.
        """
        filteredPosWordList, filteredNegWordList = [], []
        for pfmWord in pfmWordList:
            pfmIndex = pfmSentenceWordList.index(pfmWord)
            for posWord in posWordList:
                posIndex = posNegSentenceWordList.index(posWord)
                if abs(pfmIndex - posIndex) <= PFM_DISTANCE:
                    filteredPosWordList.append(posWord)
            for negWord in negWordList:
                negIndex = posNegSentenceWordList.index(negWord)
                if abs(pfmIndex - negIndex) <= PFM_DISTANCE:
                    filteredNegWordList.append(negWord)
        return filteredPosWordList, filteredNegWordList
def __init__(self, title="Poopt Bank", size="500x400"):
    """Initialise the view and register its button callback.

    :param title: window title passed to ``View.__init__``.
    :param size: window size string passed to ``View.__init__``.
    """
    View.__init__(self, title, size)
    View.set_btn_callback(self, self.__btn_callback)
    # NOTE(review): this controller is bound to a local and never used again
    # in this method; confirm whether constructing it is needed for a side
    # effect or whether it is leftover code.
    db_controller = DBController("localhost", "pooptbank")
class test_DBController(unittest.TestCase):
    """Tests for DBController, run against in-memory mongomock collections."""

    # ------------------------------------------------------------------
    # Fixtures
    # ------------------------------------------------------------------
    def setUp(self):
        """Create the three collections, seed one document each, build the controller."""
        self.database = mongomock.MongoClient().db

        self.collection_admin = self.database.create_collection("Administrator")
        self.collection_admin.insert_one({
            "_id": "393",
            "name": "software",
            "email_address": "*****@*****.**",
            "password": "******",
        })

        self.collection_member = self.database.create_collection("Member")
        self.collection_member.insert_one({
            "_id": "123",
            "name": "Terry",
            "email_address": "*****@*****.**",
            "password": "******",
        })

        self.collection_activity = self.database.create_collection("Activity")
        self.collection_activity.insert_one({
            "_id": "888",
            "name": "meeting",
            "start_time": "Apr10",
            "end_time": "Apr11",
            "location": "case",
        })

        self.db = DBController(self.collection_member, self.collection_admin,
                               self.collection_activity)

    def _marcus(self):
        """Return a new, not yet stored member with id "847"."""
        return Member("Marcus", "847", "*****@*****.**", "password")

    def _store_marcus(self):
        """Build member "847" and add it to the database."""
        self.db.add_member(self._marcus())

    def _store_terry_admin(self):
        """Build administrator "110" and add it to the database."""
        self.db.add_admin(Administrator("terry", "110", "*****@*****.**", "pass"))

    def _store_terry_admin_and_marcus(self):
        """Build administrator "110" and member "847", then add both (admin first)."""
        admin = Administrator("terry", "110", "*****@*****.**", "pass")
        member = self._marcus()
        self.db.add_admin(admin)
        self.db.add_member(member)

    def _store_marcus_with_activity(self):
        """Add member "847" and attach activity "888" of admin "393" to it."""
        self._store_marcus()
        self.db.add_activity_to_member("393", "888", "847", "on_time")

    @staticmethod
    def _party(activity_id):
        """Return a "party" activity with the given id."""
        return Activity(activity_id, "party", datetime(2020, 2, 2, 3, 20),
                        datetime(2020, 2, 2, 4, 30), "case")

    # ------------------------------------------------------------------
    # Members
    # ------------------------------------------------------------------
    def test_member_is_present(self):
        self.assertTrue(self.db.member_is_present("123"))
        self.assertFalse(self.db.member_is_present("000"))

    def test_add_member(self):
        newcomer = self._marcus()
        # Not in the collection yet, so the first insert succeeds ...
        self.assertTrue(self.db.add_member(newcomer))
        # ... and the second one is rejected.
        self.assertFalse(self.db.add_member(newcomer))
        # Id "123" was already seeded by setUp.
        duplicate = Member("Terry", "123", "*****@*****.**", "pass")
        self.assertFalse(self.db.add_member(duplicate))

    def test_update_member(self):
        unknown = self._marcus()
        self.assertFalse(self.db.update_member(unknown))
        known = Member("Terry", "123", "*****@*****.**", "change_password")
        self.assertTrue(self.db.update_member(known))

    def test_retrieve_member(self):
        self.assertEqual(self.collection_member.find_one({"_id": "123"}),
                         self.db.retrieve_member("123"))
        self.assertIsNone(self.db.retrieve_member("000"))

    def test_retrieve_member_name(self):
        self._store_marcus()
        self.assertEqual(self.db.retrieve_member_name("123"), "Terry")
        self.assertEqual(self.db.retrieve_member_name("847"), "Marcus")

    def test_delete_member(self):
        self.assertTrue(self.db.delete_member("123"))
        self.assertFalse(self.db.delete_member("123"))

    def test_member_login(self):
        self.assertTrue(self.db.member_login("123", "pass"))
        self.assertFalse(self.db.member_login("00", "pass"))
        self.assertFalse(self.db.member_login("123", "wrong_password"))

    def test_clubs_member_added(self):
        self._store_marcus()
        self.assertIsNotNone(self.db.clubs_member_added("847"))
        self.assertIsNone(self.db.clubs_member_added("000"))

    def test_add_club_to_member(self):
        self._store_marcus()
        self.assertTrue(self.db.add_club_to_member("393", "847"))
        self.assertFalse(self.db.add_club_to_member("393", "847"))
        self.assertFalse(self.db.add_club_to_member("000", "847"))

    def test_remove_club_from_member(self):
        self._store_marcus()
        self.db.add_club_to_member("393", "847")
        self.assertTrue(self.db.remove_club_from_member("393", "847"))
        self.assertFalse(self.db.remove_club_from_member("000", "847"))

    def test_request_permission(self):
        self._store_terry_admin_and_marcus()
        self.assertTrue(
            self.db.request_permission("110", "847", "*****@*****.**", "Marcus"))
        self.assertFalse(
            self.db.request_permission("000", "847", "*****@*****.**", "Marcus"))
        self.assertFalse(
            self.db.request_permission("110", "847", "*****@*****.**", "Marcus"))

    def test_admin_is_present(self):
        self.assertTrue(self.db.admin_is_present("393"))
        self.assertFalse(self.db.admin_is_present("000"))

    def test_update_member_face_id(self):
        self._store_marcus()
        self.assertTrue(self.db.update_member_face_id("847", "face_id"))
        self.assertFalse(self.db.update_member_face_id("000", "face_id"))

    def test_retrieve_member_face_id(self):
        self._store_marcus()
        self.db.update_member_face_id("847", "face_id")
        self.assertEqual(self.db.retrieve_member_face_id("847"), "face_id")

    # ------------------------------------------------------------------
    # Administrators
    # ------------------------------------------------------------------
    def test_add_admin(self):
        fresh_admin = Administrator("new_admin", "000", "*****@*****.**",
                                    "new_password")
        self.assertTrue(self.db.add_admin(fresh_admin))
        self.assertFalse(self.db.add_admin(fresh_admin))

    def test_update_admin(self):
        seeded = Administrator("software", "393", "*****@*****.**",
                               "new_password")
        self.assertTrue(self.db.update_admin(seeded))
        missing = Administrator("new_admin", "000", "*****@*****.**",
                                "new_password")
        self.assertFalse(self.db.update_admin(missing))

    def test_retrieve_admin(self):
        self.assertEqual(self.db.retrieve_admin("393"),
                         self.collection_admin.find_one({"_id": "393"}))
        self.assertIsNone(self.db.retrieve_admin("000"))

    def test_delete_admin(self):
        self.assertTrue(self.db.delete_admin("393"))
        self.assertFalse(self.db.delete_admin("000"))

    def test_admin_login(self):
        self.assertTrue(self.db.admin_login("393", "pass"))
        self.assertFalse(self.db.admin_login("393", "wrong_pass"))
        self.assertFalse(self.db.admin_login("000", "pass"))

    def test_add_member_to_added_members(self):
        self._store_terry_admin_and_marcus()
        self.assertTrue(self.db.add_member_to_added_members("110", "847"))
        self.assertFalse(self.db.add_member_to_added_members("110", "847"))
        self.assertFalse(self.db.add_member_to_added_members("000", "847"))

    def test_remove_member_from_added_members(self):
        self._store_terry_admin_and_marcus()
        self.db.add_member_to_added_members("110", "847")
        self.assertFalse(self.db.remove_member_from_added_members("000", "847"))
        self.assertFalse(self.db.remove_member_from_added_members("110", "000"))
        self.assertTrue(self.db.remove_member_from_added_members("110", "847"))

    def test_add_member_to_pending_members(self):
        self._store_terry_admin_and_marcus()
        self.assertTrue(self.db.add_member_to_pending_members("110", "847"))
        self.assertFalse(self.db.add_member_to_pending_members("000", "847"))

    def test_remove_member_from_pending_members(self):
        self._store_terry_admin_and_marcus()
        self.db.add_member_to_pending_members("110", "847")
        self.assertFalse(
            self.db.remove_member_from_pending_members("000", "847"))
        self.assertFalse(
            self.db.remove_member_from_pending_members("110", "000"))
        self.assertTrue(
            self.db.remove_member_from_pending_members("110", "847"))

    def test_added_members(self):
        self._store_terry_admin()
        self.assertIsNotNone(self.db.added_members("110"))
        self.assertEqual([], self.db.added_members("110"))

    def test_pending_members(self):
        self._store_terry_admin()
        self.assertIsNotNone(self.db.pending_members("110"))
        self.assertEqual([], self.db.pending_members("110"))

    def test_permit(self):
        self._store_terry_admin_and_marcus()
        self.db.add_member_to_pending_members("110", "847")
        self.assertTrue(self.db.permit("*****@*****.**", "110", "terry"))

    def test_reject(self):
        self._store_terry_admin_and_marcus()
        self.db.add_member_to_pending_members("110", "847")
        self.assertTrue(self.db.reject("*****@*****.**", "110", "terry"))

    # ------------------------------------------------------------------
    # Activities
    # ------------------------------------------------------------------
    def test_activity_is_present(self):
        self.assertTrue(self.db.activity_is_present("888"))
        self.assertFalse(self.db.activity_is_present("000"))

    def test_add_activity(self):
        party = self._party("999")
        self.assertTrue(self.db.add_activity(party, "393"))
        self.assertFalse(self.db.add_activity(party, "393"))

    def test_update_activity(self):
        seeded = self._party("888")
        missing = self._party("000")
        self.assertTrue(self.db.update_activity(seeded))
        self.assertFalse(self.db.update_activity(missing))

    def test_retrieve_activity(self):
        self.assertEqual(self.collection_activity.find_one({"_id": "888"}),
                         self.db.retrieve_activity("888"))
        self.assertIsNone(self.db.retrieve_activity("000"))

    def test_delete_activity(self):
        self.assertTrue(self.db.delete_activity("888"))
        self.assertFalse(self.db.delete_activity("000"))

    def test_add_activity_to_member(self):
        self._store_marcus()
        self.assertTrue(
            self.db.add_activity_to_member("393", "888", "847", "on_time"))
        self.assertFalse(
            self.db.add_activity_to_member("393", "888", "847", "on_time"))

    def test_set_member_activity_status(self):
        self._store_marcus_with_activity()
        self.assertTrue(
            self.db.set_member_activity_status("393", "888", "847", "on_time"))
        self.assertFalse(
            self.db.set_member_activity_status("393", "000", "847", "on_time"))

    def test_remove_activity_from_member(self):
        self._store_marcus_with_activity()
        self.assertTrue(
            self.db.remove_activity_from_member("393", "888", "847"))
        self.assertFalse(
            self.db.remove_activity_from_member("393", "888", "847"))

    def test_add_activity_to_admin(self):
        self._store_terry_admin()
        self.assertTrue(self.db.add_activity_to_admin("888", "110"))
        self.assertFalse(self.db.add_activity_to_admin("888", "000"))

    def test_remove_activity_from_admin(self):
        self._store_terry_admin()
        self.db.add_activity_to_admin("888", "110")
        self.assertTrue(self.db.remove_activity_from_admin("888", "110"))
        self.assertFalse(self.db.remove_activity_from_admin("888", "110"))

    def test_member_status_in_activity(self):
        self._store_marcus_with_activity()
        self.db.set_member_activity_status("393", "888", "847", "on_time")
        self.assertEqual(
            self.db.member_status_in_activity("847", "393", "888"), "on_time")

    def test_member_activities(self):
        self._store_marcus()
        self.assertEqual(self.db.member_activities("847"), [])
        self.assertIsNone(self.db.member_activities("000"))

    def test_admin_activities(self):
        self.db.add_admin(Administrator("new_admin", "000", "*****@*****.**",
                                        "new_password"))
        self.assertEqual(self.db.admin_activities("000"), [])
        self.assertIsNone(self.db.admin_activities("111"))

    def test_activity_start_time(self):
        self.assertEqual(self.db.activity_start_time("888"), "Apr10")
        self.assertIsNone(self.db.activity_start_time("000"))

    def test_activity_end_time(self):
        self.assertEqual(self.db.activity_end_time("888"), "Apr11")
        self.assertIsNone(self.db.activity_end_time("000"))
def main():
    """Run scrapeRFD with a fresh DBController and pass its result to mailAll."""
    mailAll(scrapeRFD(DBController()))
class Crawler:
    """Crawls Play Store charts for app metadata and downloads the APK files."""

    def __init__(self, is_desktop):
        """Read config.ini, start Chrome and prepare the metadata database.

        is_desktop : True when run in a desktop (GUI) environment, False when
                     run on a server. On a server a virtual display is started
                     and Chrome runs headless.
        """
        # Load the settings from config.ini.
        config = configparser.ConfigParser()
        config.read('config.ini')
        self.apk_directory = config.get('Setting', 'APK_DIRECTORY')
        os.makedirs(self.apk_directory, exist_ok=True)
        self.is_desktop = is_desktop

        # In server mode, start a virtual display.
        chrome_options = webdriver.ChromeOptions()
        if not is_desktop:
            self.display = Display(visible=0, size=(1024, 768))
            self.display.start()
            chrome_options.add_argument('--headless')

        # Start the Chrome driver.
        self.chrome = webdriver.Chrome(
            config.get('Setting', 'CHROME_DRIVER_DIRECTORY'),
            chrome_options=chrome_options)
        self.chrome.set_window_size(1024, 768)

        # (category name, chart URL) pairs to crawl.
        self.category_list = config.items('PlayStoreURL')

        # Controller that stores the metadata; make sure its table exists.
        self.db_connector = DBController(config.get('Setting', 'DB_DIRECTORY'))
        self.db_connector.create_table()

    def __get_new_app_list(self, popular_url):
        """Collect the package names listed on a category's popular chart.

        popular_url : URL of the popular chart of one category.
        Returns the list of package names found on the page.
        """
        # Open the chart and wait for it to load.
        self.chrome.get(popular_url)
        self.chrome.implicitly_wait(10)

        # The page has to be scrolled for the apps down to rank 300 to appear.
        for scroll in (10000, 20000, 30000, 40000, 50000):
            self.chrome.execute_script("window.scrollTo(0," + str(scroll) + ");")
            time.sleep(2)

        package_list = []
        div_app_list = self.chrome.find_elements_by_css_selector(
            ".card.no-rationale.square-cover.apps.small")
        # The package name is the 'id=' query parameter of each title link.
        for div_app in div_app_list:
            app_detail = div_app.find_element_by_class_name('details')
            url = app_detail.find_element_by_class_name('title')\
                .get_attribute('href')
            package_list.append(url.split('id=')[1])
        return package_list

    def __get_app_detail(self, package_list):
        """Crawl name, image source, update date and rating of each package.

        package_list : package names to look up on the Play Store.
        Returns a list of
        [name, package, image source, updated date, ratings, False] rows; the
        trailing False is the "APK downloaded" flag. A field that cannot be
        read from the page falls back to a default value.
        """
        base_url = 'https://play.google.com/store/apps/details?id='
        detail_list = []
        for package in package_list:
            # Open the app's detail page and wait for it to load.
            self.chrome.get(base_url + package)
            self.chrome.implicitly_wait(10)

            try:
                name = self.chrome.find_element_by_css_selector(
                    'h1[itemprop="name"]').text.strip()
            except Exception:
                name = package
            try:
                img_src = self.chrome.find_element_by_css_selector(
                    'img[alt="Cover art"]').get_attribute('src')
            except Exception:
                img_src = 'https://upload.wikimedia.org/wikipedia/en/4/48/Blank.JPG'
            try:
                updated_date = self.chrome.find_element_by_css_selector(
                    'span[class="htlgb"]').text.strip()
            except Exception:
                updated_date = 'January 1, 2000'
            try:
                ratings = self.chrome.find_element_by_css_selector(
                    'meta[itemprop="ratingValue"]').get_attribute('content')
            except Exception:
                ratings = -1

            print('FromPlayStore', name, package, img_src, updated_date,
                  ratings)
            detail_list.append(
                [name, package, img_src, updated_date, ratings, False])
            time.sleep(2)
        return detail_list

    def __download_apk(self, package_name, download_url):
        """Download an APK over HTTP into the APK directory.

        package_name : package being downloaded; the file is saved as
                       '<package_name>.apk'.
        download_url : URL the HTTP request is sent to.
        Returns True when the file was saved, False when the request timed
        out, answered with an HTTP error status, or anything else failed.
        """
        file_name = str(package_name) + '.apk'
        try:
            # One-minute timeout so unresponsive hosts are given up on.
            r = requests.get(download_url, timeout=60)
            # Do not store an HTTP error page as if it were the APK.
            r.raise_for_status()
            # os.path.join also works when APK_DIRECTORY has no trailing
            # separator.
            with open(os.path.join(self.apk_directory, file_name), 'wb') as apk:
                apk.write(r.content)
        except requests.exceptions.Timeout:
            print('time out')
            return False
        except Exception as e:
            print(e)
            return False
        return True

    def crawl_new(self):
        """Crawl every category's popular chart and store the result in the DB."""
        # TODO: list ranomization needed
        for category in self.category_list:
            category_name = category[0]
            url = category[1]

            # Package names on this category's popular chart.
            new_package_list = self.__get_new_app_list(url)
            # Detail rows (name, update date, image source, rating) for them.
            updated_app_list = self.__get_app_detail(new_package_list)
            # Hand the rows to the DB layer, which inserts new apps and
            # refreshes changed ones.
            self.db_connector.update_app(updated_app_list, category_name)
        self.db_connector.commit_n_close()

    def crawl_old(self):
        """Refresh the metadata of the apps already stored in the DB."""
        for category in self.category_list:
            category_name = category[0]

            # Packages already stored for this category.
            # NOTE(review): the whole (name, url) pair is passed here, while
            # update_app receives only the name -- confirm which one
            # get_old_category_app_list expects.
            old_package_list = self.db_connector.get_old_category_app_list(
                category)
            # Re-crawl their details from the Play Store and store them.
            updated_app_list = self.__get_app_detail(old_package_list)
            self.db_connector.update_app(updated_app_list, category_name)
        self.db_connector.commit_n_close()

    def update_apk(self):
        """Download the APK of every package the DB lists as not yet downloaded."""
        not_updated_list = self.db_connector.not_updated_list()
        for package_row in not_updated_list:
            package_name = package_row[0]

            # Search apkpure.com for the package name.
            search_url = 'http://apkpure.com/search?q=' + package_name
            self.chrome.get(search_url)
            self.chrome.implicitly_wait(10)

            search_titles = self.chrome.\
                find_elements_by_class_name('search-title')
            # Without any search result there is nothing to download.
            if len(search_titles) == 0:
                print(package_name + " is not searched")
                continue

            # Several results may come back; pick the one whose link contains
            # the package name. `link` stays empty unless one really matches,
            # so an unrelated app is never followed.
            link = ''
            for title in search_titles:
                candidate = title.find_element_by_tag_name('a')\
                    .get_attribute('href')
                if candidate and package_name in candidate:
                    link = candidate
                    break
            if link == '':
                print(package_name + ' is not searched in APKpure')
                continue

            # Open the app page and look for its download link.
            self.chrome.get(link)
            self.chrome.implicitly_wait(10)
            a_list = self.chrome.find_elements_by_class_name(' down')
            try:
                for a in a_list:
                    link = a.get_attribute('href')
                    if package_name in link:
                        self.chrome.get(link)
                        self.chrome.implicitly_wait(10)
                        break
                # The download page sometimes has no iframe; such an APK is
                # skipped.
                iframe = self.chrome.find_element_by_id('iframe_download')
                src = iframe.get_attribute('src')
            except Exception:
                print(package_name + " does not have href or iframe")
                continue

            # Record in the DB whether the download succeeded.
            if self.__download_apk(package_name, src):
                self.db_connector.update_is_downloaded(package_name, True)
                print(package_name, 'downloaded')
            else:
                self.db_connector.update_is_downloaded(package_name, False)
                print(package_name, 'no-downloaded')
            time.sleep(2)
        self.db_connector.commit_n_close()

    def close(self):
        """Close the browser window and, in server mode, stop the virtual display."""
        self.chrome.close()
        if not self.is_desktop:
            self.display.stop()
from DBController import DBController
from pymongo import MongoClient
from Member import Member
from Administrator import Administrator
from Activity import Activity

# Connect to the cluster and build a DBController over the three collections
# of the "AMS" database.
# NOTE(review): the connection string embeds a user name and password in
# source; consider loading it from configuration instead. The host part also
# reads "[email protected]", which looks like an e-mail-obfuscation
# artifact rather than a real host -- confirm against the original URI.
cluster = MongoClient(
    "mongodb+srv://wz:1999314Zwh%[email protected]/test?retryWrites=true&w"
    "=majority")
db = cluster["AMS"]
collection_member = db["Member"]
collection_admin = db["Administrator"]
collection_activity = db["Activity"]
db_controller = DBController(collection_member, collection_admin,
                             collection_activity)

# Disabled manual bulk runs (add / update / retrieve / delete 100 members):
# for i in range (0, 100):
#     member = Member(str(i), str(i), "*****@*****.**", "member")
#     db_controller.add_member(member)
# for i in range (0 ,100):
#     member = Member("updated name", str(i), "updated email", "updated password")
#     db_controller.update_member(member)
# for i in range (0 ,100):
#     db_controller.retrieve_member(str(i))
# for i in range (0 ,100):
#     db_controller.delete_member(str(i))
class DataExtractor(object):
    """Scrape fatsecret.com member pages and store the results via DBController.

    Every ``get*`` method accepts ``userId`` as ``None``/``[]`` (all users),
    a list of ids, or a single int id. Scrape failures are reported through
    ``logException`` and the corresponding DB update is still issued from a
    ``finally`` clause with whatever was collected up to that point.
    """

    def __init__(self):
        self.db = DBController()
        self.br = self.login()

    def login(self):
        """Open the login page, submit the first form and return the browser."""
        br = Browser()
        cj = cookielib.LWPCookieJar()
        br.set_cookiejar(cj)
        br.set_handle_equiv(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        br.set_handle_refresh(_http.HTTPRefreshProcessor(), max_time=2)
        br.open('http://www.fatsecret.com/Auth.aspx?pa=s')
        br.select_form(nr=0)
        # Keys are the name attributes of the login form fields.
        # PLEASE input your username and password here!!!!
        br['_ctl0:_ctl7:Logincontrol1:Name'] = 'username'
        br['_ctl0:_ctl7:Logincontrol1:Password'] = '******'
        br.submit()
        return br

    # ========================================================================================
    # URLType: 0 memberURL, 1 weightHistory, 2 dietHistory, 3 groups, 4 challenges, 5 buddies
    # ========================================================================================
    def getURL(self, user, URLType):
        """Build the page URL of the given type for ``user``.

        Type 0 is derived from ``user['name']``; every other type needs
        ``user['serverId']`` and yields ``None`` when that id is ``None``.

        Raises:
            Exception: ``URLType`` is not in 0..5 (and serverId is not None).
        """
        if URLType == 0:
            return 'http://fatsecret.com/member/' + '+'.join(user['name'].encode('utf-8', 'ignore').split())
        if user['serverId'] is None:
            return None
        elif URLType == 1:
            return 'http://www.fatsecret.com/Default.aspx?pa=memh&id=' + user['serverId']
        elif URLType == 2:
            return 'http://www.fatsecret.com/Diary.aspx?pa=mdcs&id=' + user['serverId']
        elif URLType == 3:
            return 'http://www.fatsecret.com/Default.aspx?pa=memgrps&id=' + user['serverId']
        elif URLType == 4:
            return 'http://www.fatsecret.com/Default.aspx?pa=memchals&id=' + user['serverId']
        elif URLType == 5:
            return 'http://www.fatsecret.com/Default.aspx?pa=memb&id=' + user['serverId']
        else:
            raise Exception('invalid URL type')

    def convertUserIdToUserList(self, userId):
        """Resolve ``userId`` (None, [], list of ids, or int) to a list of users.

        Ids the DB does not know are dropped.

        Raises:
            Exception: ``userId`` is of any other type.
        """
        if userId is None or userId == []:
            return self.db.getAllUserList()
        elif isinstance(userId, list):
            found = (self.db.getUserById(v) for v in userId)
            return [user for user in found if user is not None]
        elif isinstance(userId, int):
            user = self.db.getUserById(userId)
            return [user] if user is not None else []
        else:
            raise Exception('invalid input userId')

    def getServerId(self, userId=None):
        """Scrape and store the server id of each user that has none yet."""
        users = self.convertUserIdToUserList(userId)
        for user in users:
            if 'serverId' in user and user['serverId'] is not None:
                continue
            serverId = None
            try:
                memberURL = self.getURL(user, 0)
                page = self.br.open(memberURL)
                soup = BeautifulSoup(page.read())
                result = soup.find('div', attrs={'align': 'right', 'class': 'smallText', 'style': 'padding-top:5px'})
                if result is not None:
                    for tag in result.contents:
                        # Match on 'id=' (the token that is split on) so an
                        # href that merely contains the letters "id" does not
                        # raise IndexError and abort the search.
                        if isinstance(tag, element.Tag) and 'href' in tag.attrs and 'id=' in tag.attrs['href']:
                            serverId = tag.attrs['href'].split('id=')[1]
                            break
            except Exception as e:
                logException(user['id'], self.getServerId.__name__, e)
            finally:
                self.db.updateServerId(user['id'], serverId)

    def getWeightHistory(self, userId=None):
        """Scrape diet name, start/goal weight and dated weigh-ins per user."""
        users = self.convertUserIdToUserList(userId)
        for user in users:
            diet, startWeight, goalWeight, weightHistory = None, None, None, None
            try:
                if user['serverId'] is not None:
                    weightHistoryURL = self.getURL(user, 1)
                    page = self.br.open(weightHistoryURL)
                    soup = BeautifulSoup(page.read())
                    tag = soup.find('b')
                    diet = tag.contents[1].text
                    tag = soup.find(attrs={'style': 'padding:0px 10px'})
                    startWeight = float(tag.contents[1].split(': ')[1].split()[0])
                    goalWeight = float(tag.contents[0].text.split(': ')[1].split()[0])
                    dateList = [parser.parse(cell.text)
                                for cell in soup.findAll(attrs={'class': 'borderBottom date'})]
                    weightList = [float(cell.text.split()[0])
                                  for cell in soup.findAll(attrs={'class': 'borderBottom weight'})]
                    # Pair each date with its weight and order by date.
                    weightHistory = sorted(zip(dateList, weightList), key=lambda record: record[0])
            except Exception as e:
                logException(user['id'], self.getWeightHistory.__name__, e)
            finally:
                self.db.updateWeightHistory(user['id'], diet, startWeight, goalWeight, weightHistory)

    def getDietHistory(self, userId=None):
        """Scrape the diet diary per user and merge it with any stored history.

        Each stored record is the tuple
        ``(date, food, RDI, fat, protein, carbs, exercise, net)``.
        """
        users = self.convertUserIdToUserList(userId)
        for user in users:
            dietHistory = None
            try:
                if user['serverId'] is not None:
                    dietHistoryURL = self.getURL(user, 2)
                    page = self.br.open(dietHistoryURL)
                    soup = BeautifulSoup(page.read())
                    months = soup.findAll('td', attrs={'colspan': '6', 'class': 'borderBottom'})
                    if not months:
                        raise Exception('no diet history records')
                    monthList = [datetime.strptime(month.text, '%B %Y') for month in months]
                    rows = soup.findAll('tr', attrs={'valign': 'middle'})
                    # prevDay starts above any real day number, so the first
                    # row stays in monthList[0]; a day that does not decrease
                    # marks the move to the next month header.
                    prevDay = 32
                    monthIndex = 0
                    dietHistory = []
                    for row in rows:
                        try:
                            # Only rows with 13 content nodes are data rows.
                            if len(row.contents) != 13:
                                continue
                            day = int(re.sub('[^0-9]', '', row.contents[1].text))
                            if day >= prevDay:
                                monthIndex += 1
                            prevDay = day
                            date = datetime(monthList[monthIndex].year, monthList[monthIndex].month, day)
                            food = self.getIntFromRawString(row.contents[3].text)
                            RDI = self.getDecimalFromPercentageString(row.contents[5].text)
                            fat, protein, carbs = self.getDataFromNutrionalSummary(row.contents[7].text)
                            exercise = self.getIntFromRawString(row.contents[9].text)
                            net = self.getIntFromRawString(row.contents[11].text)
                            dietHistory.append((date, food, RDI, fat, protein, carbs, exercise, net))
                        except Exception as e:
                            logException(user['id'], self.getDietHistory.__name__, e, 'scrape row error')
                    if 'dietHistory' in user and user['dietHistory'] is not None:
                        dietHistory = self.mergeDietTrack(user['dietHistory'], dietHistory)
                    else:
                        dietHistory.sort(key=lambda item: item[0])
            except Exception as e:
                logException(user['id'], self.getDietHistory.__name__, e)
            finally:
                self.db.updateDietHistory(user['id'], dietHistory)

    def getGroup(self, userId=None):
        """Scrape each user's groups and link user and groups in the DB."""
        users = self.convertUserIdToUserList(userId)
        for user in users:
            groupIdList = []
            try:
                if user['serverId'] is not None:
                    groupURL = self.getURL(user, 3)
                    page = self.br.open(groupURL)
                    soup = BeautifulSoup(page.read())
                    results = soup.findAll('td', attrs={'width': '50', 'align': 'center'})
                    for tag in results:
                        groupName = tag.contents[1].attrs['title']
                        group = self.db.addNewGroup(groupName)
                        self.db.addUserInGroup(user['id'], group['id'])
                        groupIdList.append(group['id'])
            except Exception as e:
                logException(user['id'], self.getGroup.__name__, e)
            finally:
                self.db.addGroupInUser(user['id'], groupIdList)

    def getChallenge(self, userId=None):
        """Scrape each user's challenges and link user and challenges in the DB."""
        users = self.convertUserIdToUserList(userId)
        for user in users:
            challengeIdList = []
            try:
                if user['serverId'] is not None:
                    challengeURL = self.getURL(user, 4)
                    page = self.br.open(challengeURL)
                    soup = BeautifulSoup(page.read())
                    results = soup.findAll('td', attrs={'width': '50', 'align': 'center'})
                    for tag in results:
                        challengeName = tag.contents[1].attrs['title']
                        challenge = self.db.addNewChallenge(challengeName)
                        self.db.addUserInChallenge(user['id'], challenge['id'])
                        challengeIdList.append(challenge['id'])
            except Exception as e:
                logException(user['id'], self.getChallenge.__name__, e)
            finally:
                self.db.addChallengeInUser(user['id'], challengeIdList)

    def getBuddy(self, userId=None):
        """Scrape each user's buddy list, following "next" links page by page."""
        users = self.convertUserIdToUserList(userId)
        for user in users:
            buddyIdList = []
            try:
                if user['serverId'] is not None:
                    buddyURL = self.getURL(user, 5)
                    while True:
                        page = self.br.open(buddyURL)
                        soup = BeautifulSoup(page.read())
                        results = soup.findAll('a', attrs={'class': 'member', 'onmouseout': 'hideTip()'})
                        for tag in results:
                            if tag.text != '':
                                buddyName = tag.text.strip()
                                buddy = self.db.addNewUser(buddyName)
                                buddyIdList.append(buddy['id'])
                                # Buddies without a server id get one looked up.
                                if 'serverId' not in buddy:
                                    self.getServerId(buddy['id'])
                        result = soup.find('span', attrs={'class': 'next'})
                        if result is None:
                            break
                        else:
                            buddyURL = 'http://fatsecret.com/' + result.contents[0].attrs['href']
            except Exception as e:
                logException(user['id'], self.getBuddy.__name__, e)
            finally:
                self.db.addBuddyInUser(user['id'], buddyIdList)

    def mergeDietTrack(self, oldTrack, newTrack):
        """Merge two diet tracks, letting ``newTrack`` replace the overlap.

        Both tracks are sorted by their first field. Old records dated before
        the earliest new record are kept, followed by all new records.
        An empty ``newTrack`` returns the sorted old track unchanged; it used
        to raise IndexError, which made ``getDietHistory`` overwrite the
        stored history with an empty list.
        """
        oldTrack = sorted(oldTrack, key=lambda item: item[0])
        newTrack = sorted(newTrack, key=lambda item: item[0])
        if not newTrack:
            return oldTrack
        firstNewDate = newTrack[0][0]
        keep = 0
        for item in oldTrack:
            if item[0] >= firstNewDate:
                break
            keep += 1
        return oldTrack[:keep] + newTrack

    def cleanNonNumercial(self, dataString):
        """Strip ``dataString`` and drop every character except digits and '.'."""
        return re.sub('[^0-9.]', '', dataString.strip())

    def getIntFromRawString(self, dataString):
        """Return the int left after cleaning, or ``None`` if nothing is left."""
        dataString = self.cleanNonNumercial(dataString)
        return int(dataString) if dataString != '' else None

    def getDataFromNutrionalSummary(self, dataString):
        """Parse ``'fat: Xg protein: Yg carbs: Zg'`` into three floats.

        Returns ``(None, None, None)`` for a blank string.
        """
        if dataString.strip() == '':
            return None, None, None
        fat = float(dataString.split('fat: ')[1].split('g')[0])
        protein = float(dataString.split('protein: ')[1].split('g')[0])
        carbs = float(dataString.split('carbs: ')[1].split('g')[0])
        return fat, protein, carbs

    def getDecimalFromPercentageString(self, dataString):
        """Convert a percentage string such as ``'85%'`` to ``0.85``; blank -> None."""
        dataString = self.cleanNonNumercial(dataString)
        return float(dataString) / 100 if dataString != '' else None