def student_list(school_id, turma_id):
    db = utils.get_database()
    school = db.schools.find_one({"_id": bson.ObjectId(school_id)})
    data_filter = {
        "escola": school["name"],
        "turma": turma_id,
    }
    print repr(data_filter)
    result = db.weekly.aggregate([{
        "$match": data_filter,
    }, {
        "$group": {
            "_id": "$aluno"
        }
    }, {
        "$project": {
            "_id": 1,
            "name": {
                "$concat": ["Aluno ", "$_id"]
            }
        }
    }, {
        "$sort": {
            "name": 1
        }
    }])["result"]
    return result
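# A minimal sketch (hypothetical helper, assuming a PyMongo 3+ driver where
# aggregate() returns a cursor instead of the legacy {"result": [...]} command
# document that the module's aggregate(...)["result"] calls rely on):
def student_list_pymongo3(school_id, turma_id):
    db = utils.get_database()
    school = db.schools.find_one({"_id": bson.ObjectId(school_id)})
    pipeline = [
        {"$match": {"escola": school["name"], "turma": turma_id}},
        {"$group": {"_id": "$aluno"}},
        {"$project": {"_id": 1, "name": {"$concat": ["Aluno ", "$_id"]}}},
        {"$sort": {"name": 1}},
    ]
    # PyMongo 3+ returns a CommandCursor; materialize it into a list.
    return list(db.weekly.aggregate(pipeline))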
def main():
    empty_output_dir()
    client = utils.get_mongo_client()
    db = utils.get_database(client, 'omni')
    fake_id_dict = read_variants.read_fake_ids()
    disease_icd_dict = read_variants.read_disease_icd_dict()
    omni_to_jax_disease_dict = read_variants.read_omni_to_jax_disease_dict()
    disease_path_to_reportable_disease_dict = read_variants.read_omni_reportable_disease_name_dict()
    variant_groups_dict = read_variants.read_variant_groups_dict()
    io_drug_dict = additional_io.get_io_drug_dict()
    patients = read_variants.read_immune_results_file(
        get_immune_results_file_path())
    read_variants.read_all_variants(patients, get_variant_file_path())
    read_variants.read_summary_interprations(patients, get_summary_file_path())
    strands = annotate.read_strands('data/strands.xlsx')
    num = 1
    with open('output/manifest.txt', "w") as file:
        for order_id in patients.keys():
            patient = patients[order_id]
            read_variants.add_patient_data(
                patient, fake_id_dict, disease_icd_dict,
                omni_to_jax_disease_dict,
                disease_path_to_reportable_disease_dict)
            handle_one_patient(patient, db, strands, variant_groups_dict,
                               io_drug_dict)
            # create_recommendations(patient, db)
            create_one_report(patient)
            out_string = generate_manifest_string(num, order_id, patient)
            print(out_string)
            # file.write(out_string)
            num += 1
def main():
    client = utils.get_mongo_client()
    db = utils.get_database(client, 'omni')
    path = 'data/TSO500_UniqueVariants_runs1-6.csv'
    # path = 'data/TSO500_UniqueVariants_4.csv'
    var_list = TSO500.read_tso_unique_variants(path)
    outF = open("data/decisions.tsv", "w")
    h = "gene\tcdot\tpdot\tgene_category\tmutation_type\treport_status\treasons\tlack_of_reasons\t" \
        "is_protein_altering\tin_clinvar\tis_clinvar_benign\tis_clinvar_pathogenic\tclinvar_explain\t" \
        "is_gain_of_function\tis_loss_of_function\thotspots\tpredicted_deleterious\tis_truncating_variants\tis_near_GOF_LOF_mutation\t" \
        "omni_gene\tomni_cdot\tomni_pdot"
    # print(h)
    outF.write(h)
    outF.write("\n")
    for index, variant in enumerate(var_list):
        key = get_key_from_variant(variant)
        annotated = get_annotated_snv(key, db)
        if 'reasons' not in annotated:
            annotated['reasons'] = []
        if 'lack_of_reasons' not in annotated:
            annotated['lack_of_reasons'] = []
        is_tso_snv_reportable(annotated)
        s = f"{annotated['gene']}\t{annotated['cdot']}\t{annotated['pdot']}\t{annotated['gene_category']}\t{annotated['mutation_type']}\t" \
            f"{annotated['report_status']}\t{annotated['reasons']}\t{annotated['lack_of_reasons']}\t" \
            f"{annotated['is_protein_altering']}\t{annotated['in_clinvar']}\t{annotated['is_clinvar_benign']}\t{annotated['is_clinvar_pathogenic']}\t{annotated['clinvar_explain']}\t" \
            f"{annotated['is_gain_of_function']}\t{annotated['is_loss_of_function']}\t{annotated['hotspots']}\t{annotated['predicted_deleterious']}\t" \
            f"{annotated['is_truncating_variants']}\t{annotated['is_near_GOF_LOF_mutation']}\t" \
            f"{annotated['HGNC_Symbol']}\t{annotated['omni_c_dot']}\t{annotated['omni_p_dot']}"
        # print(s)
        outF.write(s)
        outF.write("\n")
    outF.close()
def visualize_embeddings(database='mnist',
                         model_dir='exp/mnist/run_13/',
                         model_name='model_weights.h5',
                         sprite=False,
                         model=None):
    sprite_filename = '/home/daniel/models-tensorflow/tensorflow-triplet-loss/experiments/mnist_10k_sprite.png'
    tf.logging.set_verbosity(tf.logging.INFO)

    data, input_size = get_database(database)
    _, (x_test, y_test) = data

    # Load the model from disk
    # NOTE: the "or True" below forces a reload from disk even when a model
    # object is passed in.
    if model is None or True:
        tf.reset_default_graph()
        K.clear_session()
        estimator = load_model(model_dir + model_name)
    else:
        estimator = model

    # Compute embeddings on the test set
    tf.logging.info("Predicting")
    embeddings = estimator.predict(x_test)
    tf.logging.info("Embeddings shape: {}".format(embeddings.shape))

    # Visualize test embeddings
    embedding_var = tf.Variable(embeddings, name='embedding')
    eval_dir = os.path.join(model_dir, "log")
    summary_writer = tf.summary.FileWriter(eval_dir)

    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embedding_var.name

    # Specify where to find the sprite:
    # copy the embedding sprite image to the eval directory
    if sprite:
        shutil.copy2(sprite_filename, eval_dir)
        embedding.sprite.image_path = pathlib.Path(sprite_filename).name
        embedding.sprite.single_image_dim.extend([28, 28])

    # Save the metadata file needed for the TensorBoard projector
    metadata_filename = "metadata.tsv"
    with open(os.path.join(eval_dir, metadata_filename), 'w') as f:
        for i in range(len(y_test)):
            c = y_test[i]
            f.write('{}\n'.format(c))
    embedding.metadata_path = metadata_filename

    # Tell the projector to visualise the embeddings
    projector.visualize_embeddings(summary_writer, config)

    saver = tf.train.Saver()
    with K.get_session() as sess:
        sess.run(embedding_var.initializer)
        saver.save(sess, os.path.join(eval_dir, "embeddings.ckpt"))
def get_decreasing_schools(field_name="totalminutes", op="avg", extraFilter=None):
    db = utils.get_database()
    aggregateFilter = []
    if extraFilter:
        aggregateFilter.append({"$match": extraFilter})
    results = db.weekly.aggregate(
        list(aggregateFilter) + [{
            '$group': {
                '_id': {
                    "e": "$escola",
                    "w": "$semana",
                },
                'avg': {
                    '${}'.format(op): "${}".format(field_name)
                },
            }
        }])["result"]
    all_schools = {}
    for r in results:
        school = r["_id"]["e"]
        week = r["_id"]["w"]
        avg = r["avg"]
        all_schools[school] = all_schools.get(school, {})
        all_schools[school][week] = avg
    process_output = []
    for school, data in all_schools.iteritems():
        avg = sum(data.values()) / len(data.values())
        current = sorted(data.keys())[-1]
        last = sorted(data.keys())[-2]
        diff = data[current] - data[last]
        out = {
            'schoolName': school,
            'avg': avg,
            'week': last,
            'current': data[current],
            'last': data[last],
            'diff': diff,
            'percentDiff': float(diff) / data[last] * 100 if data[last] else None,
        }
        process_output.append(out)
    probs = filter(lambda x: x["diff"] < 0, process_output)
    probs.sort(key=lambda x: x['percentDiff'])
    return probs
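# A minimal usage sketch (hypothetical school/class values, not from the data):
# rank schools whose average weekly total minutes dropped for one class, the
# same call that school_problem_list() builds from request parameters.
if __name__ == "__main__":
    probs = get_decreasing_schools(
        "totalminutes", op="avg",
        extraFilter={"escola": "Escola Exemplo", "turma": "5A"})
    for p in probs:
        print p["schoolName"], p["week"], p["percentDiff"]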
def main():
    client = utils.get_mongo_client()
    db = utils.get_database(client, 'omni')
    path = 'data/TSO500_UniqueVariants_runs1-6.csv'
    # path = 'data/TSO500_UniqueVariants_4.csv'
    var_list = TSO500.read_tso_unique_variants(path)
    variants_to_analyze = []
    for index, variant in enumerate(var_list):
        key = get_key_from_variant(variant)
        annotated = get_annotated_snv(key, db)
        if annotated is None:
            variants_to_analyze.append(variant)
        if len(variants_to_analyze) >= 100:
            report_on_variants(db, variants_to_analyze)
            variants_to_analyze = []
    if len(variants_to_analyze) > 0:
        report_on_variants(db, variants_to_analyze)
def school_problem_list():
    data_filter = {}
    school_id = flask.request.values.get("school") or flask.request.values.get(
        "escola")
    if school_id:
        school_name = utils.get_database().schools.find_one(
            {"_id": bson.ObjectId(school_id)})["name"]
        data_filter["escola"] = school_name
    turma = flask.request.values.get("turma")
    if turma:
        data_filter["turma"] = turma
    aluno = flask.request.values.get("aluno") or flask.request.values.get(
        "student")
    if aluno:
        data_filter["aluno"] = aluno
    return map(
        _add_school_by_name,
        queries.get_decreasing_schools("totalminutes", extraFilter=data_filter))
def main(argv):
    # TODO(LuHa): load options
    options = utils.get_database('options.secret', across=True)

    # TODO(LuHa): print menu
    while True:
        print('\n----+----+ Edit options ----+----+')
        print('Current options')
        print(json.dumps(options, indent=' '))
        print('1. Toggle downloaded image log')
        print('b. Back')
        user_input = input('User input: ')
        user_input = user_input.lower()
        user_input = user_input.strip()

        # TODO(LuHa): handle user input
        if user_input == '1':
            options['log'] = get_logging_level_from_user()
        elif user_input == 'b':
            break

    # TODO(LuHa): save options
    utils.set_database('options.secret', options, across=True)
#coding:utf8
from utils import get_database
from utils import get_qianyue_database
from utils import get_ltp_path
import sys
from pyltp import Segmentor
from preprocessing import tokenizer
import pickle
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
import random
from texttable import Texttable
import numpy

wechat_db = get_database(sys.argv[1]).neaten_wechat2
account_info_db = get_database(sys.argv[1]).wechat_old_account_info


def get_data_of_an_account(account_id):
    data = wechat_db.find_one({'_id': account_id})
    if data is None:
        return data
    data['articles'], data['articles2'] = [], data['articles']
    for article in data['articles2']:
        # if article['position'] == '头条':  # "headline" position
        data['articles'].append(article)
    return data


def get_account_ids(account_name):
    # (continuation of handle_user_reply; earlier lines, including the
    # definition of user_id, are not shown here)
    try:
        next_state = state_handler(event, vk)
    except Exception as error:
        logging.exception(error)
        next_state = None
    if next_state is None:
        return

    user = db.get(f"vk_{user_id}")
    if user is not None:
        user_data = json.loads(user)
        user_data["state"] = next_state
    else:
        user_data = {"state": next_state}
    db.set(f"vk_{user_id}", json.dumps(user_data))


if __name__ == "__main__":
    load_dotenv()
    global db
    db = utils.get_database()
    vk_session = vk_api.VkApi(token=os.getenv("VK_TOKEN"))
    vk = vk_session.get_api()
    longpoll = VkLongPoll(vk_session)
    for event in longpoll.listen():
        if event.type == VkEventType.MESSAGE_NEW and event.to_me:
            handle_user_reply(event, vk)
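# A minimal sketch (hypothetical handler, not part of the source) of the
# contract handle_user_reply assumes above: a state handler receives the
# longpoll event and the vk API object, replies to the user, and returns the
# name of the next state, or None to leave the stored state unchanged.
def greet_handler(event, vk):
    vk.messages.send(
        user_id=event.user_id,
        message="Hello! Type 'menu' to continue.",
        random_id=0,
    )
    return "MENU"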
def main():
    queue = get_message_queue(sys.argv[1], 'wechat_account_info_queue')
    db = get_database(sys.argv[1]).wechat_account_info
    firefoxProfile = FirefoxProfile()
    # firefoxProfile.set_preference('permissions.default.stylesheet', 2)
    firefoxProfile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so',
                                  'false')
    firefoxProfile.set_preference('permissions.default.image', 2)
    driver = webdriver.Firefox(firefoxProfile)
    driver.set_page_load_timeout(30)
    driver.implicitly_wait(5)
    login_from_cookie(driver)
    try:
        driver.find_element_by_class_name('new-header-login.unlogin')
        login_by_user()
        login_from_cookie(driver)
    except Exception as e:
        pass
    while 1:
        if queue.empty():
            print('Already finished')
            print('Waiting for new query ...')
        # wechat_id = queue.get().decode()
        wechat_id = 'HIT_SCIR'
        print('Crawling %s' % wechat_id)
        url = 'http://www.newrank.cn/public/info/detail.html?account=%s' % wechat_id
        try:
            driver.get(url)
        except:
            print('!!!!!!!!!!!!!Cannot get web page!!!!!!!!')
            time.sleep(3)
            continue
        if driver.title == u'页面错误':  # "page error"
            print('%s not included' % wechat_id)
            continue
        locator = contain_something([
            './/*[@class="tag-name-list"]//li[1]',
            './/*[@class="tag-name-list"]//a[1]'
        ])
        try:
            WebDriverWait(driver, 10, 0.5).until(locator)
            info_tree = etree.HTML(
                driver.find_element_by_class_name('info-detail-head')
                .get_attribute('innerHTML'))
            account_info = dict()
            account_info['_id'] = wechat_id
            account_info['str_id'] = wechat_id
            account_info['name'] = info_tree.xpath(
                './/*[@class="info-detail-head-weixin-name"]/span')[0].xpath(
                    'string(.)').strip()
            account_info['description'] = info_tree.xpath(
                './/*[@class="info-detail-head-weixin-fun-introduce ellipsis"]/@title'
            )[0]
            account_info['category'] = info_tree.xpath(
                './/*[@class="info-detail-head-classify-subname"]/a/text()')
            account_info['fans_count'] = info_tree.xpath(
                './/*[@class="detail-fans-counts"]/@data')[0]
            try:
                driver.find_element_by_xpath(
                    './/*[@class="info-detail-head-classify"]//*[@class="detail-edit info-detail-edit detail-pic"]'
                ).click()
                html = driver.find_element_by_id(
                    'current_tag_list').get_attribute('innerHTML')
                account_info['tags'] = etree.HTML(html).xpath('.//a/text()')
            except:
                account_info['tags'] = []
            print(account_info)
            continue  # NOTE: this skips the insert below; likely a debug leftover
            db.insert_one(account_info)
        except Exception as e:
            print(e)
            print('Error')
            with open('./fail_ids_for_user_info.dat', 'a') as fout:
                fout.write('%s\n' % wechat_id)
def _add_school_by_name(data):
    data["school"] = utils.get_database().schools.find_one(
        {"name": data["schoolName"]})
    data["_id"] = data["school"]["_id"]
    return _add_weekly_report_link(data)
def main(argv):
    """ main flow """
    # TODO(LuHa): print message about program execution
    #utils.logger.info(
    #    '\x1B[38;5;5m[Danbooru] Execute danbooru downloader\x1B[0m')
    print('\x1B[38;5;5m[Danbooru] Execute danbooru downloader\x1B[0m')

    # TODO(LuHa): create downloads directory
    # actually, this code uses only the downloads directory,
    # but to ensure execution of the source code,
    # make the save directory too.
    os.makedirs('./downloads', exist_ok=True)
    os.makedirs('./save', exist_ok=True)

    # TODO(LuHa): load ban database
    ban_db = utils.get_database('ban.secret')

    # TODO(LuHa): load mute database
    mute_db = utils.get_database('mute.secret')

    # TODO(LuHa): read pre-downloaded images
    downloaded = utils.get_downloaded_images('danbooru')

    # TODO(LuHa): load tags
    if os.path.exists('tags.secret'):
        with open('tags.secret', 'r') as f_tags:
            tags = json.load(f_tags)
            tags = tags['danbooru']
    else:
        utils.logger.error('[Danbooru] Need tags in file named tags.secret')
        return

    # TODO(LuHa): load API keys
    if os.path.exists('danbooru_api.secret'):
        utils.logger.debug('[Danbooru] API key exists')
        with open('danbooru_api.secret', 'r') as f_api:
            api_key = f_api.read()
            api_key = api_key.strip()
    else:
        utils.logger.error(
            '[Danbooru] Need API key in file named danbooru_api.secret')
        utils.logger.error('[Danbooru] The format is ID:APIKEY')
        return

    # TODO(LuHa): create opener with an HTTP basic auth header
    auth = api_key
    auth = auth.encode('ascii')
    auth = base64.b64encode(auth)
    auth = auth.decode('utf-8')
    opener = urllib.request.build_opener()
    opener.addheaders = [('Authorization', 'Basic ' + auth)]

    # TODO(LuHa): loop search by tags
    base_url = 'https://danbooru.donmai.us'
    # for fun
    random.shuffle(tags)
    for tag in tags:
        request_url = (base_url + '/posts.json?tags=' + tag + '&random=true')
        utils.logger.info(
            '\x1B[38;5;5m[Danbooru] Request: {0}\x1B[0m'.format(request_url))
        response = opener.open(request_url, timeout=30)
        try:
            posts = json.loads(response.read().decode('utf-8'))
        except socket.timeout:
            utils.logger.info('\x1B[38;5;5m[Danbooru] Response timeout\x1B[0m')
            return

        # TODO(LuHa): loop download by posts
        # get 20 images at one time from danbooru
        for post in posts:
            # skip if the target image is already downloaded
            if post['id'] in downloaded:
                utils.logger.debug('[Danbooru] Already downloaded {0}'.format(
                    post['id']))
                continue
            elif post['id'] in ban_db['danbooru']:
                utils.logger.debug('[Danbooru] Ban downloaded {0}'.format(
                    post['id']))
                continue
            elif post['id'] in mute_db['danbooru']:
                utils.logger.debug('[Danbooru] Mute downloaded {0}'.format(
                    post['id']))
                continue
            else:
                downloaded.add(post['id'])

            # Change url rule at 180410
            request_url = post['file_url']
            try:
                response = opener.open(request_url, timeout=TIMEOUT)
            except socket.timeout:
                utils.logger.info('[Danbooru] Request timeout')
                return
            image_path = ('./downloads' + '/danbooru-' + str(post['id']) +
                          '.' + post['file_ext'])
            with open(image_path, 'wb') as f:
                try:
                    f.write(response.read())
                except socket.timeout:
                    utils.logger.info(('\x1B[38;5;5m[Danbooru] ' +
                                       'Response timeout\x1B[0m'))
                    return
            utils.logger.debug('[Danbooru] Downloaded {0}'.format(image_path))

            # sleep to prevent being blocked
            utils.dynamic_sleep()

    # TODO(LuHa): print message about program termination
    utils.logger.info(
        '\x1B[38;5;5m[Danbooru] Terminate danbooru downloader\x1B[0m')
    dropout = 0.35  # Dropout probability of each layer. Conv layers use SpatialDropout2D
    blocks = 6  # Number of (Conv -> Act -> BN -> MaxPool -> Dropout) blocks
    n_channels = args['channels']  # Number of channels (feature maps) of the first convolution block;
    # each following block uses 1.5 times the number of channels of the previous block
    weight_decay = 1e-4 * 0

    # Dataloader parameters.
    # Folder's path where the files query.txt and bounding_box_train.txt are:
    # query.txt contains the path and the class of test images,
    # bounding_box_train.txt contains the path and the class of train images.
    path = args['path']

    exp_dir, log_dir, model_weights_path, model_name = get_dirs(database)
    print(exp_dir, log_dir, model_weights_path, model_name)
    data, input_size = get_database(database)  # if database == 'skillup', data is None
    im_size = input_size[:2]

    data_gen_args_train = dict(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        rotation_range=10,  # randomly rotate images in the range (degrees, 0 to 180)
        zoom_range=0.1,  # randomly zoom image
        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=False,  # randomly flip images
def main():
    queue = get_message_queue(sys.argv[1], 'wechat_article_content_queue')
    firefoxProfile = FirefoxProfile()
    firefoxProfile.set_preference('permissions.default.stylesheet', 2)
    firefoxProfile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so',
                                  'false')
    firefoxProfile.set_preference('permissions.default.image', 2)
    # driver = webdriver.Firefox(firefoxProfile)
    driver = webdriver.PhantomJS(service_args=['--load-images=false'])
    print('Driver is ready')
    driver.implicitly_wait(5)
    db = get_database(sys.argv[1]).article_contents
    finished_count = 0
    while 1:
        # time.sleep(random.uniform(1, 1))
        if queue.empty():
            print('Already finished')
            print('Waiting for new query ...')
        url = queue.get().decode()
        print('start')
        try:
            driver.get(url)
        except:
            print(url)
            print('!!!!!!!!!!!!!Cannot get web page!!!!!!!!')
            continue
        if driver.title.strip() == '':
            print(url)
            print('Empty title')
            continue
        locator = (By.XPATH, '//*[@id="page-content"]')
        try:
            WebDriverWait(driver, 10, 0.5).until(
                EC.presence_of_element_located(locator))
            data = dict()
            data['href'] = url
            page_content = etree.HTML(
                driver.find_element_by_id('page-content').get_attribute(
                    'innerHTML'))
            if page_content is None:
                print('Did not find page-content')
                print(url)
                continue
            data['title'] = ''.join(
                page_content.xpath('//*[@id="activity-name"]/text()')).strip()
            data['post-user'] = ''.join(
                page_content.xpath('//*[@id="post-user"]/text()')).strip()
            data['post-date'] = ''.join(
                page_content.xpath('//*[@id="post-date"]/text()')).strip()
            data['origin'] = ''.join(
                page_content.xpath(
                    '//*[@id="copyright_logo"]/text()')).strip()
            data['title2'] = ''.join(
                page_content.xpath(
                    '//*[@class="rich_media_meta rich_media_meta_text"]/text()'
                )).strip()
            data['content'] = get_content(page_content)
            # print(data)
            print('end')
            assert len(data['content']) > 0
            db.insert_one(data)
            finished_count += 1
            if finished_count % 100 == 0:
                print('Quit driver')
                driver.quit()
                time.sleep(random.random() * 3 + 1)
                driver = webdriver.PhantomJS(
                    service_args=['--load-images=false'])
                print('Driver is ready')
                driver.implicitly_wait(5)
        except Exception as e:
            print(url)
            print('Error')
            print(e)
            with open('./fail_ids_for_article_content.data', 'a') as fout:
                fout.write('%s\n' % url)
        time.sleep(random.random() * 3 + 1)
def school_weekly_report(id):
    school = _get_school(id)
    db = utils.get_database()
    data_filter = {
        "escola": school["name"],
    }
    turma = flask.request.values.get("turma")
    if turma:
        data_filter["turma"] = turma
    aluno = flask.request.values.get("aluno") or flask.request.values.get(
        "student")
    if aluno:
        data_filter["aluno"] = aluno

    # Per-week statistics, grouped by "semana": min, max and avg of every
    # tracked metric. The group stage is built programmatically so each metric
    # yields the keys minX, maxX and avgX used below.
    metrics = [
        ("totalMinutes", "totalminutes"),
        ("videoMinutes", "videominutes"),
        ("exerciseMinutes", "exerciseminutes"),
        ("nivel1", "nivel1"),
        ("nivel2", "nivel2"),
        ("precisaPraticar", "precisa_praticar"),
        ("praticado", "praticado"),
        ("dominado", "dominado"),
        ("pontos", "pontos"),
        ("comDificuldade", "com dificuldade"),
    ]
    group_stage = {"_id": "$semana"}
    for out_name, field in metrics:
        key = out_name[0].upper() + out_name[1:]
        for op in ("min", "max", "avg"):
            group_stage[op + key] = {"${}".format(op): "${}".format(field)}

    result = db.weekly.aggregate([
        {
            "$match": data_filter,
        },
        {
            "$group": group_stage,
        },
        {
            "$sort": {
                "_id": -1
            }
        },
    ])["result"]

    def _split_result(r):
        out = {
            "_id": r["_id"],
            "week": r["_id"],
            "avg": {},
            "min": {},
            "max": {},
        }
        for out_name, _ in metrics:
            key = out_name[0].upper() + out_name[1:]
            for op in ("min", "max", "avg"):
                out[op][out_name] = r[op + key]
        return out

    return map(_split_result, result)
def _get_school(id_):
    return utils.get_database().schools.find_one({"_id": bson.ObjectId(id_)})
def __init__(self, config_file):
    self.tmall_info_db = get_database(config_file).neaten_tmall
    self.wechat_info_db = get_database(config_file).neaten_wechat
    self.sp = ScoreProvider_V1(config_file)
def main(argv):
    """ main flow """
    # TODO(LuHa): print message about program execution
    print('\x1B[38;5;5m[Pixiv] Execute pixiv downloader\x1B[0m')

    # TODO(LuHa): create downloads directory
    # actually, this code uses only the downloads directory,
    # but to ensure execution of the source code,
    # make the save directory too.
    os.makedirs('./downloads', exist_ok=True)
    os.makedirs('./save', exist_ok=True)

    # TODO(LuHa): load ban database
    ban_db = utils.get_database('ban.secret')

    # TODO(LuHa): load mute database
    mute_db = utils.get_database('mute.secret')

    # TODO(LuHa): read pre-downloaded images
    downloaded = utils.get_downloaded_images('pixiv')

    # TODO(LuHa): load tags
    if os.path.exists('tags.secret'):
        with open('tags.secret', 'r') as f_tags:
            tags = json.load(f_tags)
            tags = tags['pixiv']
    else:
        print('[Pixiv] Need tags in file named tags.secret')
        return

    # TODO(LuHa): load API keys
    if os.path.exists('pixiv_api.secret'):
        print('[Pixiv] API key exists')
        with open('pixiv_api.secret', 'r') as f_api:
            api_key = json.load(f_api)
            user_id = api_key['id'].strip()
            user_passwd = api_key['passwd'].strip()
    else:
        print('[Pixiv] Need user id and passwd file ' +
              'named pixiv_api.secret')
        print('[Pixiv] The format is below')
        print('{')
        print('    "id": "ID",')
        print('    "passwd": "PASSWD"')
        print('}')
        return

    # TODO(LuHa): load cookie from file
    cookie_jar = http.cookiejar.LWPCookieJar('pixiv_cookie.secret')
    if os.path.exists('pixiv_cookie.secret'):
        cookie_jar.load()
    cookie = urllib.request.HTTPCookieProcessor(cookie_jar)

    # TODO(LuHa): create opener
    opener = urllib.request.build_opener(cookie)
    opener.addheaders = [('User-agent', 'Mozilla/5.0'),
                         ('Accept', 'text/html')]

    # TODO(LuHa): get hidden value for login
    hidden_parser = LoginTagParser()
    base_url = 'https://accounts.pixiv.net/'
    page_url = 'login'
    request_url = base_url + page_url
    response = opener.open(request_url, timeout=TIMEOUT)
    try:
        hidden_parser.feed(response.read().decode('utf-8'))
    except socket.timeout:
        print('\x1B[38;5;5m[Pixiv] Response timeout\x1B[0m')
        return
    auth = hidden_parser.get_hidden()

    # TODO(LuHa): if the cookie is not logged in, log in with the cookie
    try:
        if 'post_key' in auth.keys():
            auth['pixiv_id'] = user_id
            auth['password'] = user_passwd
            auth = urllib.parse.urlencode(auth)
            auth = auth.encode('ascii')
            opener.open(request_url, data=auth, timeout=TIMEOUT)

        # TODO(LuHa): query the daily ranking
        # rank start url:
        # https://www.pixiv.net/ranking.php?mode=daily&date=20070913
        for tag in tags:
            base_url = 'https://www.pixiv.net/'
            page_url = 'ranking.php' + tag
            request_url = base_url + page_url
            if tag.endswith('date='):
                request_url = request_url + get_random_date()
            print(
                '\x1B[38;5;5m[Pixiv] Request: {0}\x1B[0m'.format(request_url))
            response = opener.open(request_url, timeout=TIMEOUT)

            # TODO(LuHa): get page uri
            image_page_parser = ImagePageParser()
            try:
                image_page_parser.feed(response.read().decode('utf-8'))
            except socket.timeout:
                print('\x1B[38;5;5m[Pixiv] Response timeout\x1B[0m')
                return

            # TODO(LuHa): get image uri, but multiple pages remain
            image_url_parser = ImageURLParser()
            for image_page in image_page_parser.get_pages():
                request_url = base_url + image_page
                response = opener.open(request_url, timeout=TIMEOUT)
                try:
                    image_url_parser.feed(response.read().decode('utf-8'))
                except socket.timeout:
                    print('\x1B[38;5;5m[Pixiv] Response timeout\x1B[0m')
                    return
                #print('[P] image url ready {0}'.format(len(image_url_parser.get_urls())))
            print('[Pixiv] Get ranking page')

            # TODO(LuHa): resolve multi-image posts to direct image URLs
            image_urls = image_url_parser.get_urls()
            multi_page_parser = MultiPageParser()
            multi_url_parser = MultiURLParser()
            final_urls = list()
            for image_url in image_urls:
                #print('[P] final URL ready {0}'.format(len(final_urls)))
                if image_url.startswith('https://'):
                    final_urls.append(image_url)
                    continue
                multi_url_parser.clear_urls()
                multi_page_parser.clear_pages()
                request_url = 'https://www.pixiv.net/' + image_url
                response = opener.open(request_url, timeout=TIMEOUT)
                try:
                    multi_page_parser.feed(response.read().decode('utf-8'))
                except socket.timeout:
                    print('\x1B[38;5;5m[Pixiv] Response timeout\x1B[0m')
                    return
                for multi_page in multi_page_parser.get_pages():
                    request_url = 'https://www.pixiv.net' + multi_page
                    response = opener.open(request_url, timeout=TIMEOUT)
                    try:
                        multi_url_parser.feed(response.read().decode('utf-8'))
                    except socket.timeout:
                        print('\x1B[38;5;5m[Pixiv] Response timeout\x1B[0m')
                        return
                final_urls.extend(multi_url_parser.get_urls())
            print('[Pixiv] Get URLs of all images in ranking')

            # TODO(LuHa): download images
            for image_url in final_urls:
                image_id = image_url.split('/')[-1]
                if image_id in downloaded:
                    print('[Pixiv] Already downloaded {0}'.format(image_id))
                    continue
                elif image_id in ban_db['pixiv']:
                    print('[Pixiv] Ban downloaded {0}'.format(image_id))
                    continue
                elif image_id in mute_db['pixiv']:
                    print('[Pixiv] Mute downloaded {0}'.format(image_id))
                    continue
                else:
                    downloaded.add(image_id)
                file_name = ('./downloads' + '/pixiv-' +
                             image_url.split('/')[-1])
                with open(file_name, 'wb') as f:
                    referer = 'https://www.pixiv.net/member_illust.php'
                    referer = referer + '?mode=medium&illust_id='
                    referer = referer + file_name.split('_')[0]
                    opener.addheaders = [('User-agent', 'Mozilla/5.0'),
                                         ('Referer', referer)]
                    response = opener.open(image_url, timeout=TIMEOUT)
                    try:
                        f.write(response.read())
                    except socket.timeout:
                        print('\x1B[38;5;5m[Pixiv] Response timeout\x1B[0m')
                        return
                print('[Pixiv] Downloaded {0}'.format(file_name))

                # sleep to prevent being blocked
                utils.dynamic_sleep()
    except KeyboardInterrupt:
        print('[Pixiv] Keyboard Interrupt')
    except Exception as e:
        print('[Pixiv] Some Interrupt', e)

    # TODO(LuHa): save cookie to file
    cookie_jar.save()

    # TODO(LuHa): print message about program termination
    print('\x1B[38;5;5m[Pixiv] Terminate pixiv downloader\x1B[0m')
from flask import Flask, request, jsonify, render_template, url_for
from flask import session
from flask_googleauth import GoogleAuth

import random
import urlparse

import twilio.twiml
from twilio import TwilioRestException

from utils import load_data, set_trace
from models import log_call, aggregate_stats, valid_users
from utils import get_database, play_or_say, locate_member_ids

app = Flask(__name__)
app.config.from_object('config.ConfigProduction')
app.secret_key = app.config['SECRET_KEY']

db = get_database(app)

auth = GoogleAuth(app)

call_methods = ['GET', 'POST']

campaigns, legislators, districts = load_data()
defaults_campaign = campaigns['default']


def full_url_for(route, **kwds):
    return urlparse.urljoin(app.config['APPLICATION_ROOT'],
                            url_for(route, **kwds))


def get_campaign(cid):
    return dict(defaults_campaign, **campaigns[cid])
def main():
    queue = get_message_queue(sys.argv[1],
                              'wechat_official_account_content_queue')
    firefoxProfile = FirefoxProfile()
    # firefoxProfile.set_preference('permissions.default.stylesheet', 2)
    firefoxProfile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so',
                                  'false')
    firefoxProfile.set_preference('permissions.default.image', 2)
    driver = webdriver.Firefox(firefoxProfile)
    # driver = webdriver.PhantomJS(service_args=['--load-images=false'])
    print('Driver is ready')
    driver.implicitly_wait(10)
    # driver.set_page_load_timeout(30)
    db = get_database(sys.argv[1]).wechat_article_list
    while 1:
        if queue.empty():
            print('Already finished')
            print('Waiting for new query ...')
        # wechat_id = queue.get().decode()
        wechat_id = 'HIT_SCIR'
        print('Crawling %s' % wechat_id)
        url = 'http://www.newrank.cn/public/info/detail.html?account=%s' % wechat_id
        try:
            driver.get(url)
        except:
            print('!!!!!!!!!!!!!Cannot get web page!!!!!!!!')
            time.sleep(3)
            continue
        if driver.title == u'页面错误':  # "page error"
            print('%s not included' % wechat_id)
            continue
        locator = (By.XPATH, '//*[@id="info_detail_article_lastest"]//li')
        try:
            WebDriverWait(driver, 20, 0.5).until(
                EC.presence_of_element_located(locator))
            elements = driver.find_elements_by_xpath(
                '//*[@id="info_detail_article_lastest"]//li')
            data = dict()
            data['str_id'] = wechat_id
            data['article_list'] = []
            for e in elements:
                article = dict()
                article['title'] = e.find_element_by_class_name(
                    'ellipsis').get_attribute('title')
                article['href'] = e.find_element_by_class_name(
                    'ellipsis').get_attribute('href')
                article['short_text'] = e.find_element_by_class_name(
                    'article-text').find_element_by_tag_name(
                        'a').get_attribute('title')
                article['date'] = e.find_element_by_class_name(
                    'info-detail-article-date').text
                article['read_count'] = e.find_element_by_class_name(
                    'read-count').text
                article['like_count'] = e.find_element_by_class_name(
                    'links-count').text
                article['position'] = e.find_element_by_class_name(
                    'tj').find_elements_by_tag_name('span')[1].text
                data['article_list'].append(article)
            assert len(data['article_list']) > 0
            record = db.find_one({'str_id': wechat_id})
            if record is None:
                print('Did not find %s in database' % wechat_id)
                db.insert(data)
            else:
                print(len(record['article_list']))
                for article in data['article_list']:
                    if article not in record['article_list']:
                        record['article_list'].append(article)
                print(len(record['article_list']))
                db.replace_one({'str_id': wechat_id}, record)
        except Exception as e:
            print('Error')
            print(e)
            with open('./fail_ids_for_article_urls.data', 'a') as fout:
                fout.write('%s\n' % wechat_id)
            print('Did not find id "info_detail_article_lastest" when crawling %s'
                  % wechat_id)
        sleep_time = random.uniform(1, 3)
        time.sleep(sleep_time)
def main():
    # General parameters
    net = ['base', 'cifar', 'emb+soft', 'resnet50', 'resnet20', 'local_feat'][0]
    database = ['cifar10', 'mnist', 'fashion_mnist', 'skillup'][1]
    epochs = 10
    learn_rate = 0.01
    decay = (learn_rate / epochs) * 0.8
    ims_per_id = 8
    ids_per_batch = 8
    margin = 0.9
    embedding_size = 64
    squared = False
    data_augmentation = False
    patience = 25

    # Built model's parameters
    dropout = 0.3
    blocks = 3
    n_channels = 32
    weight_decay = 1e-4 * 0

    # Dataloader parameters
    use_dataloader = True
    path = '/home/daniel/proyectos/product_detection/web_market_preproces/duke_from_images'

    exp_dir, log_dir, model_weights_path, model_name = get_dirs(database)

    tl_object = TripletLoss(ims_per_id=ims_per_id,
                            ids_per_batch=ids_per_batch,
                            margin=margin,
                            squared=squared)
    tl_h = TripletLoss(ims_per_id, ids_per_batch, margin, squared)
    opt = optimizers.Adam(lr=learn_rate, decay=decay)

    data, input_size = get_database(database)
    im_size = input_size[:2]

    data_gen_args_train = dict(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        rotation_range=10,  # randomly rotate images in the range (degrees, 0 to 180)
        zoom_range=0.1,  # randomly zoom image
        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=False,  # randomly flip images
        vertical_flip=False)
    if not data_augmentation:
        data_gen_args_train = {}

    model_args = dict(embedding_dim=embedding_size,
                      input_shape=input_size,
                      drop=dropout,
                      blocks=blocks,
                      n_channels=n_channels,
                      weight_decay=weight_decay,
                      layer_limit=173,
                      patience=patience)
    data_loader_args = dict(path=path,
                            ims_per_id=ims_per_id,
                            ids_per_batch=ids_per_batch,
                            target_image_size=im_size,
                            data_gen_args=data_gen_args_train,
                            preprocess_unit=True,
                            data=data)

    if database == 'skillup':
        dl = FileDataloader(**data_loader_args)
    else:
        dl = StaticDataloader(**data_loader_args)

    model = get_net_object(net, model_args)
    model.compile(opt, tl_object.cluster_loss)

    if use_dataloader:
        model.train_generator(dl, model_weights_path, epochs, log_dir)
    else:
        model.train(data, model_weights_path, epochs,
                    ims_per_id * ids_per_batch, log_dir)
    model.save_model(model_weights_path)

    visualize_embeddings(database=database,
                         model_dir=exp_dir,
                         model_name=model_name,
                         model=model.model)
def main(argv):
    """ main flow """
    # TODO(LuHa): print message about program execution
    utils.logger.info(
        '\x1B[38;5;5m[Wallhaven] Execute wallhaven downloader\x1B[0m')

    # TODO(LuHa): create downloads directory
    # actually, this code uses only the downloads directory,
    # but to ensure execution of the source code,
    # make the save directory too.
    os.makedirs('./downloads', exist_ok=True)
    os.makedirs('./save', exist_ok=True)

    # TODO(LuHa): load ban database
    ban_db = utils.get_database('ban.secret')

    # TODO(LuHa): load mute database
    mute_db = utils.get_database('mute.secret')

    # TODO(LuHa): read pre-downloaded images
    downloaded = utils.get_downloaded_images('wallhaven')

    # TODO(LuHa): load tags
    if os.path.exists('tags.secret'):
        with open('tags.secret', 'r') as f_tags:
            tags = json.load(f_tags)
            tags = tags['wallhaven']
    else:
        utils.logger.error('[Wallhaven] Need tags in file named tags.secret')
        return

    # TODO(LuHa): load API keys
    if os.path.exists('wallhaven_api.secret'):
        print('[Wallhaven] API key exists')
        with open('wallhaven_api.secret', 'r') as f_api:
            api_key = json.load(f_api)
            user_id = api_key['id'].strip()
            user_passwd = api_key['passwd'].strip()
    else:
        print('[Wallhaven] Need user id and passwd file ' +
              'named wallhaven_api.secret')
        print('[Wallhaven] The format is below')
        print('{')
        print('    "id": "ID",')
        print('    "passwd": "PASSWD"')
        print('}')
        return

    # TODO(LuHa): load cookie from file
    cookie_jar = http.cookiejar.LWPCookieJar('wallhaven_cookie.secret')
    if os.path.exists('wallhaven_cookie.secret'):
        cookie_jar.load()
    cookie = urllib.request.HTTPCookieProcessor(cookie_jar)

    # TODO(LuHa): create opener
    opener = urllib.request.build_opener(cookie)
    opener.addheaders = [('User-agent', 'Mozilla/5.0'),
                         ('Accept', 'text/html')]

    # TODO(LuHa): check whether already logged in
    request_url = 'https://alpha.wallhaven.cc/auth/login'
    response = opener.open(request_url, timeout=TIMEOUT)
    login_parser = LoginParser()
    try:
        login_parser.feed(response.read().decode('utf-8'))
    except socket.timeout:
        print('\x1B[38;5;5m[Wallhaven] Response timeout\x1B[0m')
        return

    # TODO(LuHa): if the cookie is not logged in, log in with the cookie
    try:
        if login_parser.get_logined() == False:
            request_url = 'https://alpha.wallhaven.cc/auth/login'
            auth = {'username': user_id, 'password': user_passwd}
            auth = urllib.parse.urlencode(auth)
            auth = auth.encode('ascii')
            opener.open(request_url, data=auth)

        # TODO(LuHa): loop search by tags
        base_url = 'https://alpha.wallhaven.cc/search'
        max_page_parser = MaxPageParser()
        id_parser = ImageIdParser()
        uri_parser = ImageURIParser()
        # for fun
        random.shuffle(tags)
        for tag in tags:
            base_url = 'https://alpha.wallhaven.cc/search'
            max_page_parser.clear_data()
            id_parser.clear_ids()
            uri_parser.clear_uris()

            # TODO(LuHa): get max page
            opener.addheaders = [('User-agent', 'Mozilla/5.0'),
                                 ('Accept', 'text/html')]
            request_url = base_url + tag
            print('\x1B[38;5;5m[Wallhaven] Request: {0}\x1B[0m'.format(
                request_url))
            response = opener.open(request_url, timeout=TIMEOUT)
            try:
                max_page_parser.feed(response.read().decode('utf-8'))
            except socket.timeout:
                print('\x1B[38;5;5m[Wallhaven] Response timeout\x1B[0m')
                return
            max_page = max_page_parser.get_data()
            max_page = max_page.split()
            if len(max_page) > 3:
                max_page = int(max_page[3])
            else:
                max_page = 1

            # TODO(LuHa): get image ids from a random page
            random_page = random.randint(1, max_page)
            random_page = '&page=' + str(random_page)
            request_url = base_url + tag + random_page
            response = opener.open(request_url, timeout=TIMEOUT)
            try:
                id_parser.feed(response.read().decode('utf-8'))
            except socket.timeout:
                print('\x1B[38;5;5m[Wallhaven] Response timeout\x1B[0m')
                return

            # TODO(LuHa): loop parse image path
            # get 24 images at one time from wallhaven
            print('[Wallhaven] Search image path')
            for image_id in id_parser.get_ids():
                # skip if the target image is already downloaded
                if image_id in downloaded:
                    print('[Wallhaven] Already downloaded {0}'.format(image_id))
                    continue
                elif image_id in ban_db['wallhaven']:
                    print('[Wallhaven] Ban downloaded {0}'.format(image_id))
                    continue
                elif image_id in mute_db['wallhaven']:
                    print('[Wallhaven] Mute downloaded {0}'.format(image_id))
                    continue
                else:
                    downloaded.add(image_id)
                base_url = 'https://alpha.wallhaven.cc/'
                request_url = (base_url + 'wallpaper/' + image_id)
                response = opener.open(request_url, timeout=TIMEOUT)
                try:
                    uri_parser.feed(response.read().decode('utf-8'))
                except socket.timeout:
                    print('\x1B[38;5;5m[Wallhaven] Response timeout\x1B[0m')
                    return
                # sleep to prevent being blocked
                utils.dynamic_sleep()

            # TODO(LuHa): loop download by posts
            opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            for image_uri in uri_parser.get_uris():
                request_url = ('https:' + image_uri)
                response = opener.open(request_url, timeout=TIMEOUT)
                image_path = ('./downloads/' + image_uri.split('/')[-1])
                with open(image_path, 'wb') as f:
                    try:
                        f.write(response.read())
                    except socket.timeout:
                        print('\x1B[38;5;5m[Wallhaven] Response timeout\x1B[0m')
                        return
                print('[Wallhaven] Downloaded {0}'.format(image_path))
                # sleep to prevent being blocked
                utils.dynamic_sleep()
    except KeyboardInterrupt:
        print('[Wallhaven] Keyboard Interrupt')
    except Exception as e:
        print('[Wallhaven] Some Interrupt', e)

    # TODO(LuHa): save cookie
    cookie_jar.save()

    # TODO(LuHa): print message about program termination
    utils.logger.info(
        '\x1B[38;5;5m[Wallhaven] Terminate wallhaven downloader\x1B[0m')
def school_list():
    return map(_add_weekly_report_link, utils.get_database().schools.find())