def main():
    args = get_args()
    config = utils.get_config(args.config)
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    session = http_session.StorageSession(**config['session'],
                                          access_key=utils.get_access_token())

    root_dir = config['data']['root_dir']

    raw_path = utils.build_path(root_dir=root_dir, sub_dir=args.raw,
                                date=args.date, ext='json')
    data = download_data(session, path=raw_path)

    rows = parse_data(data)
    LOGGER.info("Retrieved %s rows", len(rows))

    headers = utils.get_headers(config['fields'])
    rows = transform.clean(rows, data_types=headers, date=args.date)

    output_path = utils.build_path(root_dir=root_dir, sub_dir=args.output,
                                   date=args.date, ext='csv')
    utils.write_csv(path=output_path, rows=rows, header=args.header)
def __init__(self, name=None, key_resource_id=None, extra_resources=None,
             path=None, lang="en"):
    self.key_resource_id = urljoin(BASE_URL, key_resource_id.strip())
    self.filename = hashlib.sha1(name.encode("utf-8")).hexdigest()
    self.title = name if len(name) < 80 else name[:80]
    self.path_levels = path
    self.lang = lang
    self.file = None
    self.video = None
    self.ids = set()
    LOGGER.info("Collecting: {}".format(self.key_resource_id))
    LOGGER.info(" - Name: {}".format(self.title))
    LOGGER.info(" - Lang: {}".format(self.lang))
    self.html = HTMLLesson(source_id=self.key_resource_id, name=self.title,
                           lang=self.lang)
    if self.path_levels[-1] is None:
        self.base_path = build_path([DATA_DIR] + self.path_levels[:-1] + [self.filename])
    else:
        self.base_path = build_path([DATA_DIR] + self.path_levels + [self.filename])
    if extra_resources is not None:
        LOGGER.info(" - Extra resources: {}".format(len(extra_resources)))
        self.set_extra_resources(extra_resources)
def __init__(self):
    build_path([TESSIndiaChef.TREES_DATA_DIR])
    self.scrape_stage = os.path.join(TESSIndiaChef.TREES_DATA_DIR,
                                     TESSIndiaChef.SCRAPING_STAGE_OUTPUT_TPL)
    self.crawling_stage = os.path.join(TESSIndiaChef.TREES_DATA_DIR,
                                       TESSIndiaChef.CRAWLING_STAGE_OUTPUT_TPL)
    super(TESSIndiaChef, self).__init__()
def pre_run(self, args, options):
    build_path([FolkDCChef.TREES_DATA_DIR])
    self.download_css_js()
    self.lang = options.get('--lang', "en")
    self.RICECOOKER_JSON_TREE = FolkDCChef.SCRAPING_STAGE_OUTPUT_TPL.format(lang=self.lang)
    self.scrape_stage = os.path.join(FolkDCChef.TREES_DATA_DIR,
                                     self.RICECOOKER_JSON_TREE)
    channel_tree = self.scrape(args, options)
    self.write_tree_to_json(channel_tree)
def scrape(self, args, options):
    download_video = options.get('--download-video', "1")
    load_video_list = options.get('--load-video-list', "0")

    if int(download_video) == 0:
        global DOWNLOAD_VIDEOS
        DOWNLOAD_VIDEOS = False

    if int(load_video_list) == 1:
        global LOAD_VIDEO_LIST
        LOAD_VIDEO_LIST = True

    global CHANNEL_SOURCE_ID
    self.RICECOOKER_JSON_TREE = 'ricecooker_json_tree.json'
    channel_tree = dict(
        source_domain=CHANNEL_DOMAIN,
        source_id=CHANNEL_SOURCE_ID,
        title=CHANNEL_NAME,
        description=CHANNEL_DESCRIPTION[:400],  # 400 UPPER LIMIT characters allowed
        thumbnail=CHANNEL_THUMBNAIL,
        author=AUTHOR,
        language=CHANNEL_LANGUAGE,
        children=[],
        license=LICENSE,
    )

    grades = GradeJsonTree(subject_node=SubjectNode)
    grades.load("resources.json", auto_parse=True, author=AUTHOR, license=LICENSE,
                save_url_to=build_path([DATA_DIR, CHANNEL_SOURCE_ID]),
                load_video_list=load_video_list)

    base_path = [DATA_DIR]
    base_path = build_path(base_path)
    for grade in grades:
        for subject in grade.subjects:
            for lesson in subject.lessons:
                video = lesson.download(download=DOWNLOAD_VIDEOS, base_path=base_path)
                lesson.add_node(video)
                subject.add_node(lesson)
            grade.add_node(subject)
        channel_tree["children"].append(grade.to_dict())
    return channel_tree
def scrape(self, args, options):
    download_video = options.get('--download-video', "1")
    load_video_list = options.get('--load-video-list', "0")

    if int(download_video) == 0:
        global DOWNLOAD_VIDEOS
        DOWNLOAD_VIDEOS = False

    if int(load_video_list) == 1:
        global LOAD_VIDEO_LIST
        LOAD_VIDEO_LIST = True

    channel_tree = self.lessons()
    base_path = [DATA_DIR] + ["data"]
    base_path = build_path(base_path)

    vocabulary = VocabularyConversationalEnglish()
    for unit in vocabulary.auto_generate_units(BASE_URL, base_path):
        vocabulary.add_node(unit)

    english_grammar = EnglishGrammar()
    for unit in english_grammar.auto_generate_units(base_path):
        english_grammar.add_node(unit)

    channel_tree["children"].append(english_grammar.to_node())
    channel_tree["children"].append(vocabulary.to_node())
    return channel_tree
def download(self, download=True, base_path=None):
    if "watch?" not in self.source_id or "/user/" in self.source_id or download is False:
        return

    download_to = build_path([base_path, 'videos'])
    for i in range(4):
        try:
            info = self.get_video_info(download_to=download_to, subtitles=False)
            if info is not None:
                LOGGER.info(" + Video resolution: {}x{}".format(
                    info.get("width", ""), info.get("height", "")))
                self.filepath = os.path.join(download_to, "{}.mp4".format(info["id"]))
                self.filename = info["title"]
                if self.filepath is not None and os.stat(self.filepath).st_size == 0:
                    LOGGER.info(" + Empty file")
                    self.filepath = None
        except (ValueError, IOError, OSError, URLError, ConnectionResetError) as e:
            LOGGER.info(e)
            LOGGER.info("Download retry")
            time.sleep(.8)
        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ContentTooShortError,
                youtube_dl.utils.ExtractorError, OSError) as e:
            LOGGER.info(" + An error occurred; the video may not be available.")
            return
        except OSError:
            return
        else:
            return
def process(cmd, file_name, content):
    """
    Handler to process incoming commands
    cmd: command name
    file_name: file name to be processed by command
    content: content of the file in case of create command
    """
    logger.info('command: {}, file_name: {}, content: {}'.format(cmd, file_name, content))
    try:
        utils.check_working_dir()
        cmd_to_execute = COMMANDS.get(cmd, default_cmd)
        file_path = utils.build_path(file_name)
        if content:
            result = cmd_to_execute(file_path, content)
        else:
            result = cmd_to_execute(file_path)
        if result:
            logger.info('result = {}'.format(result))
    except CreateFileException as e:
        logger.error('CreateFileException: {}'.format(e.message))
    except ReadFileException as e:
        logger.error('ReadFileException: {}'.format(e.message))
    except RemoveFileException as e:
        logger.error('RemoveFileException: {}'.format(e.message))
    except GetMetaDataException as e:
        logger.error('GetMetaDataException: {}'.format(e.message))
def scrape(self, args, options):
    download_video = options.get('--download-video', "1")
    basic_lessons = int(options.get('--basic-lessons', "0"))
    intermedian_lessons = int(options.get('--intermedian-lessons', "0"))
    load_video_list = options.get('--load-video-list', "0")

    if int(download_video) == 0:
        global DOWNLOAD_VIDEOS
        DOWNLOAD_VIDEOS = False

    if int(load_video_list) == 1:
        global LOAD_VIDEO_LIST
        LOAD_VIDEO_LIST = True

    global channel_tree
    if basic_lessons == 1:
        channel_tree, subjects = self.k12_lessons()
    elif intermedian_lessons == 1:
        channel_tree, subjects = self.intermediate_lessons()
    else:
        channel_tree, subjects = self.k12_lessons()

    base_path = [DATA_DIR] + ["King Khaled University in Abha"]
    base_path = build_path(base_path)
    for subject in subjects:
        for topic in subject.topics:
            for unit in topic.units:
                unit.download(download=DOWNLOAD_VIDEOS, base_path=base_path)
                topic.add_node(unit)
            subject.add_node(topic)
        channel_tree["children"].append(subject.to_node())
    return channel_tree
def write_videos(self, from_i=0, to_i=None):
    path = [DATA_DIR] + ["abdullah_videos"]
    path = build_path(path)
    for section in self.get_sections(from_i=from_i, to_i=to_i):
        LOGGER.info("* Section: {}".format(section.title))
        section.download(download=DOWNLOAD_VIDEOS, base_path=path)
        yield section.to_node()
def newProject(args=None):
    '''
    >>> p = newProject()
    >>> p['message']
    'Project has been created'
    '''
    filename = _check_project_index()
    with open(filename, 'r') as fp:
        pindexes = json.load(fp)

    counter = pindexes['counter'] + 1
    name = 'project-%d' % counter
    path = os.path.join(project_base_path, name)
    if os.path.exists(path):
        logging.warning('Project path %s already exists', path)
    else:
        logging.info('Make project path %s', path)
        os.mkdir(path)

    args = ['init', '--src', path, path]
    _pyarmor(args)

    pindexes['projects'][name] = os.path.abspath(path)
    pindexes['counter'] = counter
    with open(filename, 'w') as fp:
        json.dump(pindexes, fp)

    project = Project()
    project.open(path)

    project['name'] = name
    project['title'] = name
    project['output'] = build_path('dist', path)
    return dict(project=project, message='Project has been created')
def scrape(self, args, options):
    run_test = bool(int(options.get('--test', "0")))

    global channel_tree
    channel_tree = dict(
        source_domain=FolkDCChef.BASE_URL,
        source_id=CHANNEL_SOURCE_ID + "-" + self.lang,
        title="{} ({})".format(CHANNEL_NAME, self.lang),
        description="""Digital Children's Folksongs for Language and Cultural Learning: a collection of multi-language folk songs and activities for primary students to learn languages, engage in collaboration and critical thinking, and develop intercultural skills. Contains folk songs, activity suggestions, and teacher training materials."""[:400],  # 400 UPPER LIMIT characters allowed
        thumbnail=CHANNEL_THUMBNAIL,
        author=AUTHOR,
        language=self.lang,
        children=[],
        license=LICENSE,
    )

    if run_test is True:
        return test(channel_tree)
    else:
        resources = Resource(lang=self.lang)
        resources.load("resources.json")
        for resource in resources:
            base_path = build_path([DATA_DIR, resource.lang, resource.cls_name()])
            resource.to_file(base_path)
            node = resource.to_dict()
            if node is not None:
                channel_tree["children"].append(node)
        return channel_tree
def build_pdfs_nodes(self, urls, base_path):
    base_path = build_path([base_path, 'pdfs'])
    pdf_nodes = []
    for pdf_url in urls:
        pdf_file = File(source_id=pdf_url, lang=self.lang, title=self.title)
        pdf_file.download(download=DOWNLOAD_FILES, base_path=base_path)
        pdf_nodes.append(pdf_file)
    return pdf_nodes
def _generate_resource_config(api_info, tag_info, custom_configs):
    msg_prefix = {}
    for i in ["create", "update", "get"]:
        s = api_info.get(i, {}).get("msg_prefix", None)
        if s:
            msg_prefix[i] = s

    create_api = api_info["create"]["api"]

    rn = tag_info["name"]
    if custom_configs:
        rn = custom_configs.get("resource_name", rn)
        if isinstance(rn, unicode):
            raise Exception("Must config resource_name in English, "
                            "because the tag is Chinese")

    data = {
        "name": rn[0].upper() + rn[1:].lower(),
        "service_type": create_api["service_type"],
        "base_url": build_path(create_api["path"]),
        "msg_prefix": msg_prefix,
        "description": tag_info.get("description", ""),
        "create_verb": api_info["create"]["create_verb"],
    }

    if "update" in api_info:
        data["update_verb"] = build_path(api_info["update"]["update_verb"])

    if "list" in api_info:
        info = api_info["list"]
        if "identity" not in info:
            raise Exception("Must config identity for list operation")

        api = info["api"]
        v = {
            "path": build_path(api["path"]),
            "identity": [{"name": i} for i in info["identity"]]
        }
        v["query_params"] = [{"name": i["name"]} for i in api["query_params"]]
        if "msg_prefix" in info:
            v["msg_prefix"] = info["msg_prefix"]

        data["list_info"] = v

    return pystache.Renderer().render_path("template/resource.mustache", data)
def articles(self):
    for article_tag in self.soup.find_all("div", class_="views-row"):
        title = article_tag.find("div", class_="views-field-title")
        url = urljoin(BASE_URL, title.find("a").get("href"))
        article = Article(title.text, url)
        article.thumbnail = article_tag.find("img").get("src")
        base_path = build_path([DATA_DIR, self.topic.title, article.title])
        if article.to_file(base_path) is True:
            yield article.to_node()
def store_raw_data(sampling_feature, date, directory, data):
    suffix = sampling_feature.rpartition('/')[2]
    path = utils.build_path(date=date, ext='xml', directory=directory, suffix=suffix)

    # Serialise
    with open(path, 'w') as file:
        file.write(data)
        LOGGER.info("Wrote '%s'", file.name)
def build_pdfs_nodes(self, base_path, content):
    pdfs_url = self.get_pdfs_urls(content)
    base_path = build_path([base_path, 'pdfs'])
    for pdf_url in pdfs_url:
        pdf_file = File(source_id=pdf_url, lang=self.lang, title=self.title)
        pdf_file.download(download=DOWNLOAD_FILES, base_path=base_path)
        yield pdf_file
def build_audio_nodes(self, base_path, content):
    audio_urls = self.get_audio_urls(content)
    base_path = build_path([base_path, 'audio'])
    for audio_url in audio_urls:
        audio_file = Audio(source_id=audio_url, lang=self.lang, title=self.title)
        audio_file.download(download=DOWNLOAD_AUDIO, base_path=base_path)
        yield audio_file
def build_video_nodes(self, base_path, content):
    videos_url = self.get_videos_urls(content)
    base_path = build_path([DATA_DIR])
    video_nodes = []
    for video_url in videos_url:
        if YouTubeResource.is_youtube(video_url) and not YouTubeResource.is_channel(video_url):
            video = YouTubeResourceNode(video_url, lang=self.lang)
            video.download(download=DOWNLOAD_VIDEOS, base_path=base_path)
            yield video
def build_pdfs_nodes(self, base_path, content):
    pdfs_urls = self.get_pdfs_urls(content)
    base_path = build_path([base_path, 'pdfs'])
    pdf_nodes = []
    for pdf_url in pdfs_urls:
        pdf_file = File(pdf_url, lang=self.lang, name=self.title)
        pdf_file.download(download=DOWNLOAD_FILES, base_path=base_path)
        node = pdf_file.to_node()
        if node is not None:
            pdf_nodes.append(node)
    return pdf_nodes
def _build(args):
    '''Build project, obfuscate all files in the project'''
    project = Project()
    project.open(args.project)
    logging.info('Build project %s ...', args.project)

    capsule = build_path(project.capsule, args.project)

    if not args.only_runtime:
        output = project.output
        mode = project.get_obfuscate_mode()
        files = project.get_build_files(args.force)
        src = project.src
        filepairs = [(os.path.join(src, x), os.path.join(output, x))
                     for x in files]

        logging.info('%s increment build', 'Disable' if args.force else 'Enable')
        logging.info('Search scripts from %s', src)
        logging.info('Obfuscate %d scripts with mode %s', len(files), mode)
        for x in files:
            logging.info('\t%s', x)
        logging.info('Save obfuscated scripts to %s', output)

        obfuscate_scripts(filepairs, mode, capsule, output)

        # for x in targets:
        #     output = os.path.join(project.output, x)
        #     pairs = [(os.path.join(src, x), os.path.join(output, x))
        #              for x in files]
        #     for src, dst in pairs:
        #         try:
        #             shutil.copy2(src, dst)
        #         except Exception:
        #             os.makedirs(os.path.dirname(dst))
        #             shutil.copy2(src, dst)

        project['build_time'] = time.time()
        project.save(args.project)

    if not args.no_runtime:
        logging.info('Make runtime files')
        make_runtime(capsule, output)
        if project.entry:
            for x in project.entry.split(','):
                filename = os.path.join(output, x.strip())
                logging.info('Update entry script %s', filename)
                make_entry(filename, project.runtime_path)
        else:
            logging.info('\tIn order to import obfuscated scripts, insert ')
            logging.info('\t2 lines in entry script:')
            logging.info('\t\tfrom pytransform import pyarmor_runtime')
            logging.info('\t\tpyarmor_runtime()')

    logging.info('Build project OK.')
def build_video_nodes(self, base_path, content):
    videos_url = self.get_videos_urls(content)
    base_path = build_path([DATA_DIR, "videos"])
    video_nodes = []
    for video_url in videos_url:
        if YouTubeResource.is_youtube(video_url):
            video = YouTubeResource(video_url, lang=self.lang)
            video.download(download=DOWNLOAD_VIDEOS, base_path=base_path)
            node = video.to_node()
            if node is not None:
                video_nodes.append(node)
    return video_nodes
def test_build_path_with_lines(self):
    fun_name = 'test_build_path_with_lines'
    lines = True
    correct = 'HP' + str(self.valid_booknum) + '/hp' + str(self.valid_booknum) + \
              '_' + str(self.valid_pagenum) + '_lines.png'
    try:
        assert correct == utils.build_path(self.valid_booknum, self.valid_pagenum, lines)
        print(fun_name + ' ' + SUCCESS)
        return 1
    except AssertionError:
        print(fun_name + ' ' + FAILED)
        return 0
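The expected string in this test fully specifies the path format being checked ('HP{booknum}/hp{booknum}_{pagenum}_lines.png'). A minimal sketch of a build_path helper that would satisfy this assertion is shown below; it is hypothetical and only illustrates the tested contract, the project's real implementation may differ (the behaviour for lines=False is an assumption not covered by this test).

# Hypothetical sketch: a build_path(booknum, pagenum, lines) consistent with
# the assertion in test_build_path_with_lines. Not the project's actual helper.
def build_path(booknum, pagenum, lines=False):
    suffix = '_lines' if lines else ''  # assumed behaviour when lines is False
    return 'HP{0}/hp{0}_{1}{2}.png'.format(booknum, pagenum, suffix)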
def playlist_name_links(self):
    name_url = []
    source_id_hash = hashlib.sha1(self.source_id.encode("utf-8")).hexdigest()
    base_path = build_path([DATA_DIR, CHANNEL_SOURCE_ID])
    videos_url_path = os.path.join(base_path, "{}.json".format(source_id_hash))

    if file_exists(videos_url_path) and LOAD_VIDEO_LIST is True:
        with open(videos_url_path, "r") as f:
            name_url = json.load(f)
    else:
        for url in self.playlist_links():
            youtube = YouTubeResourceNode(url)
            info = youtube.get_resource_info()
            name_url.append((info["title"], url))
        with open(videos_url_path, "w") as f:
            json.dump(name_url, f)
    return name_url
def save_thumbnail(url, title):
    import imghdr
    from io import BytesIO
    try:
        r = requests.get(url)
    except:
        return None
    else:
        img_buffer = BytesIO(r.content)
        img_ext = imghdr.what(img_buffer)
        if img_ext != "gif":
            filename = "{}.{}".format(title, img_ext)
            base_dir = build_path([DATA_DIR, DATA_DIR_SUBJECT, "thumbnails"])
            filepath = os.path.join(base_dir, filename)
            with open(filepath, "wb") as f:
                f.write(img_buffer.read())
            return filepath
def write_video(self, base_path, content):
    videos = content.find_all(
        lambda tag: tag.name == "a" and tag.attrs.get("href", "").find("youtube") != -1 or
        tag.attrs.get("href", "").find("youtu.be") != -1 or
        tag.text.lower() == "youtube")
    VIDEOS_DATA_DIR = build_path([base_path, 'videos'])
    for video in videos:
        youtube = YouTubeResource(video.get("href", ""), lang=self.lang)
        node = get_node_from_channel(youtube.resource_url, channel_tree)
        if node is None:
            youtube.to_file(filepath=VIDEOS_DATA_DIR)
            node = youtube.node
        if node is not None:
            if video.parent.name == 'li':
                video.parent.replace_with("Video name: " + node["title"])
            if node["source_id"] not in self.ids:
                self.nodes.append(node)
                self.ids.add(node["source_id"])
def updateProject(args):
    '''
    >>> p = newProject()['project']
    >>> updateProject(title='My Project')
    'Update project OK'
    '''
    name = args['name']
    path = os.path.join(project_base_path, name)

    project = Project()
    project.open(path)
    if args['output']:
        args['output'] = build_path(args['output'], path)
    else:
        args['output'] = os.path.join(path, 'dist')
    project._update(args)
    project.save(path)

    return 'Update project OK'
def scrape(self, args, options):
    download_video = options.get('--download-video', "1")
    load_video_list = options.get('--load-video-list', "0")

    if int(download_video) == 0:
        global DOWNLOAD_VIDEOS
        DOWNLOAD_VIDEOS = False

    if int(load_video_list) == 1:
        global LOAD_VIDEO_LIST
        LOAD_VIDEO_LIST = True

    global channel_tree
    channel_tree, grades = self.lessons()
    base_path = [DATA_DIR]
    base_path = build_path(base_path)
    for subject in grades:
        for lesson in subject.lessons:
            lesson.download(download=DOWNLOAD_VIDEOS, base_path=base_path)
            channel_tree["children"].append(lesson.to_node())
    return channel_tree
def download(self, base_path):
    PDFS_DATA_DIR = build_path([base_path, 'pdfs'])
    try:
        response = sess.get(self.source_id)
        content_type = response.headers.get('content-type')
        if 'application/pdf' in content_type:
            self.filepath = os.path.join(PDFS_DATA_DIR, self.filename)
            with open(self.filepath, 'wb') as f:
                for chunk in response.iter_content(10000):
                    f.write(chunk)
            LOGGER.info(" - Get file: {}, node name: {}".format(self.filename, self.name))
    except requests.exceptions.HTTPError as e:
        LOGGER.info("Error: {}".format(e))
    except requests.exceptions.ConnectionError:
        # This is a weird error; maybe it's raised when the webpage
        # is slow to serve the requested resources.
        LOGGER.info("Connection error, the resource will be scraped in 5s...")
        time.sleep(3)
    except requests.exceptions.ReadTimeout as e:
        LOGGER.info("Error: {}".format(e))
    except requests.exceptions.TooManyRedirects as e:
        LOGGER.info("Error: {}".format(e))
def train(lr, w, l2_reg, epoch, batch_size, model_type, num_layers, data_type,
          word2vec, num_classes=2):
    if data_type == "WikiQA":
        train_data = WikiQA(word2vec=word2vec)
    elif data_type == "MSRP":
        train_data = MSRP(word2vec=word2vec)

    train_data.open_file(mode="train")

    print("=" * 50)
    print("training data size:", train_data.data_size)
    print("training max len:", train_data.max_len)
    print("=" * 50)

    model = ABCNN(s=train_data.max_len, w=w, l2_reg=l2_reg, model_type=model_type,
                  num_features=train_data.num_features, num_classes=num_classes,
                  num_layers=num_layers)

    optimizer = tf.train.AdagradOptimizer(lr, name="optimizer").minimize(model.cost)

    # Due to GTX 970 memory issues
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    init = tf.global_variables_initializer()

    # keep no more than 100 models
    saver = tf.train.Saver(max_to_keep=100)

    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True

    # with tf.Session() as sess:
    with tf.Session(config=session_config) as sess:
        train_summary_writer = tf.summary.FileWriter("C:/tf_logs/train", sess.graph)
        sess.run(init)
        print("=" * 50)

        for e in range(1, epoch + 1):
            print("[Epoch " + str(e) + "]")

            train_data.reset_index()
            i = 0
            clf_features = []

            while train_data.is_available():
                i += 1

                batch_x1, batch_x2, batch_y, batch_features = train_data.next_batch(batch_size=batch_size)

                merged, _, c, features = sess.run(
                    [model.merged, optimizer, model.cost, model.output_features],
                    feed_dict={model.x1: batch_x1,
                               model.x2: batch_x2,
                               model.y: batch_y,
                               model.features: batch_features})

                clf_features.append(features)

                if i % 100 == 0:
                    print("[batch " + str(i) + "] cost:", c)
                train_summary_writer.add_summary(merged, i)

            save_path = saver.save(sess,
                                   build_path("./models/", data_type, model_type, num_layers),
                                   global_step=e)
            print("model saved as", save_path)

            clf_features = np.concatenate(clf_features)

        print("training finished!")
        print("=" * 50)