def __init__(self, cache_dir, categories, keywords, amount=None):
    """Load the abstracts, fetching and caching them on first use.

    The abstracts are kept in ``abstracts.txt`` inside ``cache_dir``. When
    that file does not exist yet it is produced from
    ``self._fetch_all(amount)``, one abstract per line.

    Args:
        cache_dir: Directory holding the cache file; ``~`` is expanded and
            the directory is passed to ``ensure_directory``.
        categories: Stored unchanged on ``self.categories``.
        keywords: Stored unchanged on ``self.keywords``.
        amount: Forwarded to ``self._fetch_all``; only used when the cache
            file is missing.

    After construction ``self.data`` holds the lines of the cache file as
    returned by ``readlines()`` (trailing newlines included).
    """
    self.categories = categories
    self.keywords = keywords
    cache_dir = os.path.expanduser(cache_dir)
    ensure_directory(cache_dir)
    filename = os.path.join(cache_dir, 'abstracts.txt')
    if not os.path.isfile(filename):
        # Write to a scratch file and move it into place only after every
        # abstract has been written. If fetching fails midway, no truncated
        # 'abstracts.txt' is left behind to be mistaken for a complete
        # cache on the next run.
        partial = filename + '.part'
        with open(partial, 'w') as file_:
            for abstract in self._fetch_all(amount):
                file_.write(abstract + '\n')
        os.rename(partial, filename)
    with open(filename) as file_:
        self.data = file_.readlines()
def _init_or_load_session(self):
    """Create the session and saver, resuming from a checkpoint if one exists.

    Sets ``self.sess``, ``self.saver`` and ``self.epoch``. When a checkpoint
    is found under ``self.params.checkpoint_dir`` its variables are restored
    and ``self.epoch`` becomes the number at the end of the checkpoint path
    plus one. Otherwise the checkpoint directory is passed to
    ``ensure_directory``, all variables are initialized and ``self.epoch``
    starts at 1.
    """
    self.sess = tf.Session()
    self.saver = tf.train.Saver()
    state = tf.train.get_checkpoint_state(self.params.checkpoint_dir)

    # Nothing to resume from: start a fresh run.
    if not (state and state.model_checkpoint_path):
        ensure_directory(self.params.checkpoint_dir)
        print('Randomly initialize variables')
        self.sess.run(tf.initialize_all_variables())
        self.epoch = 1
        return

    checkpoint_path = state.model_checkpoint_path
    print('Load checkpoint', checkpoint_path)
    self.saver.restore(self.sess, checkpoint_path)
    # The checkpoint path is expected to end in "-<number>"; continue with
    # the number that follows it.
    finished = re.search(r'-(\d+)$', checkpoint_path).group(1)
    self.epoch = int(finished) + 1
def _build_task(entry, directory, chapter_number, section_number=None):
    """Describe the conversion of one chapter or section for a consumer.

    The 'section' key is only added for sections. main() checks finished
    results for that key to tell sections from chapters (presumably the
    consumer echoes the task fields back in its result -- confirm against
    Converter).
    """
    task = {
        'title': entry['title'],
        # '/' in a title is replaced so it is not read as a path separator.
        'path': '%s/%s.pdf' % (directory, entry['title'].replace('/', '-')),
        'link': entry['link'],
        'chapter': chapter_number,
    }
    if section_number is not None:
        task['section'] = section_number
    return task


def _record_result(index, result):
    """Mark the chapter or section named by *result* as converted or failed."""
    entry = index[result['chapter']]
    if 'section' in result:
        entry = entry['sections'][result['section']]
    if 'error' in result:
        entry['error'] = True
    else:
        entry['pdf'] = result['path']


def main():
    """Convert every not-yet-processed page of the index using worker consumers.

    Reads ``./output/index.yaml``, queues one task per chapter and section
    that carries neither a 'pdf' nor an 'error' key, and hands the tasks to
    ``cpu_count() * 2`` ``Converter`` consumers. Each finished result is
    recorded on the index and the index is persisted with ``store_index``
    after every result. Entries already marked 'pdf' or 'error' are skipped.

    A summary line is printed when any page is marked as failed, whether the
    failure was recorded earlier in the index or reported during this run.
    """
    logging.basicConfig(filename='process.log', level=logging.INFO)
    # Close the index file as soon as it is parsed; an empty file parses to
    # None, which is treated as "no chapters".
    with open('./output/index.yaml') as stream:
        index = safe_load(stream) or []
    # One page per chapter plus one per section.
    total_pages = sum(len(chapter['sections']) + 1 for chapter in index)

    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    num_consumers = multiprocessing.cpu_count() * 2
    logging.info("Creating %s consumers", num_consumers)
    consumers = [Converter(tasks, results) for _ in range(num_consumers)]
    bar = Bar('Downloading using %i consumers' % num_consumers, max=total_pages)
    ensure_directory('./output/pdf')
    errors_counter = 0
    for consumer in consumers:
        consumer.start()

    for chapter_number, chapter in enumerate(index):
        chapter_directory = './output/pdf/%s - %s' % (chapter['number'], chapter['title'])
        ensure_directory(chapter_directory)
        # The chapter page itself first (no section number), then its sections.
        entries = [(chapter, None)]
        entries.extend(
            (section, section_number)
            for section_number, section in enumerate(chapter['sections'])
        )
        for entry, section_number in entries:
            if 'pdf' in entry:
                # Converted by an earlier run.
                bar.next()
                total_pages -= 1
            elif 'error' in entry:
                # Failed in an earlier run; it is not retried.
                bar.next()
                total_pages -= 1
                errors_counter += 1
            else:
                tasks.put(_build_task(entry, chapter_directory, chapter_number, section_number))

    # Add a poison pill for each consumer
    for _ in range(num_consumers):
        tasks.put(None)

    # From here on total_pages equals the number of queued tasks, i.e. the
    # number of results still to be collected.
    while total_pages > 0:
        result = results.get()
        bar.next()
        logging.info("Task is done. \nfile: %s", result['path'])
        if 'error' in result:
            # Count failures from this run too, so the summary below is
            # printed on the run in which they actually happen.
            errors_counter += 1
        _record_result(index, result)
        # Persist after every result so progress survives an interruption.
        store_index(index)
        total_pages -= 1

    # Wait for all of the tasks to finish
    tasks.join()
    if errors_counter > 0:
        print('Got %s errors. check logs for more info' % errors_counter)