Code Example #1
    # Python 2 fragment; assumes `from multiprocessing import Process, Queue`
    # and `import time`, plus CACHE, bar, max_threads_count, childs, and
    # collect_childs defined in the enclosing scope.
    queue = Queue()
    results_queue = Queue()
    new_pages_count = 0

    for page in childs:
        if page['link'] not in CACHE:
            CACHE.append(page['link'])
            queue.put(page, timeout=10)

            # workaround: multiprocessing.Queue.put hands the item to a
            # background feeder thread, so pause briefly to let it flush
            time.sleep(0.01)

            new_pages_count += 1

    done = 0
    bar.max = len(CACHE)

    while not queue.empty():
        threads_count = 1 + (new_pages_count - done) // 2

        if threads_count > max_threads_count:
            threads_count = max_threads_count

        workers = [Process(target=collect_childs, args=(queue, results_queue))
                   for _ in xrange(threads_count)]
        for w in workers:
            w.daemon = True
            w.start()

        for w in workers:
            w.join(timeout=0.1)
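
The fragment never shows how `done` advances or how new pages come back from the workers. A minimal sketch of that missing drain step, assuming (hypothetically) that each `collect_childs` worker puts one result dict per page onto `results_queue` with a `childs` key:

    # Hypothetical drain step, not part of the original fragment; the
    # 'childs' result key and one-result-per-page contract are assumptions.
    while not results_queue.empty():
        result = results_queue.get()
        done += 1
        bar.next()
        for page in result.get('childs', []):
            if page['link'] not in CACHE:
                CACHE.append(page['link'])
                queue.put(page, timeout=10)
                new_pages_count += 1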
Code Example #2
File: simplepg.py  Project: eminga/simplEPG
# module-level imports needed by this function:
import datetime
import importlib
import pickle
import sys
import xml.etree.ElementTree as ET

import pytz

# parse_xmltv_date and process_show are project helpers defined elsewhere
def create_epg(config):
	now = pytz.utc.localize(datetime.datetime.utcnow())
	timespan_global = int(config.find("timespan_index").text)
	timespan_full_global = int(config.find("timespan_full").text)

	caching_global = config.find("caching").text in ("on", "yes", "true", "True")

	try:
		timespan_force_global = int(config.find("timespan_force").text)
	except TypeError:
		timespan_force_global = -1

	try:
		with open("cached_epg.pkl", "rb") as fp:
			cache = pickle.load(fp)
	except IOError:
		cache = {}

	cache_new = {}

	# root element of epg
	tv = ET.Element("tv")
	tv.set("generator-info-name", "simplEPG v0.1")
	tv.set("generator-info-url", "https://github.com/eminga/simplEPG")

	c_pos = 0

	successful = set()
	for channel in config.findall("channel"):
		try:
			site = importlib.import_module('sites.' + channel.get("site"))
		except ModuleNotFoundError:
			print("Error: could not find module sites." + channel.get("site"))
			continue
		channelid = channel.get("xmltv_id")
		print(channel.get("site") + ":" + channelid)

		if channelid in successful:
			print("channel already added, skipping...")
			continue

		try:
			timespan = int(channel.get("timespan_index"))
		except TypeError:
			timespan = timespan_global

		try:
			timespan_full = int(channel.get("timespan_full"))
		except TypeError:
			timespan_full = timespan_full_global

		caching = channel.get("caching")
		if caching is None:
			caching = caching_global
		else:
			caching = caching in ("on", "yes", "true", "True")

		try:
			timespan_force = int(channel.get("timespan_force"))
		except TypeError:
			timespan_force = timespan_force_global
		if timespan_force == -1:
			timespan_force = -10000

		try:
			shows = site.grab(channel.get("site_id"), timespan)
		except (KeyboardInterrupt, SystemExit):
			raise
		except Exception:
			shows = []
			print("An error occurred:")
			print(sys.exc_info())


		if len(shows) > 0:
			successful.add(channelid)
			c = ET.Element("channel", id = channel.get("xmltv_id"))
			ET.SubElement(c, "display-name").text = channel.text
			tv.insert(c_pos, c)
			c_pos += 1

			# create progress bar if module is available
			try:
				from progress.bar import Bar
			except ImportError:
				class Bar:
					def __init__(self, label, max):
						self.max = max
						self.index = 0

					def next(self):
						self.index += 1

					def update(self):
						pass

					def finish(self):
						if self.index > 0:
							print("%s shows added." % self.index)


			bar = Bar("Processing", max=len(shows))

			for i in range(len(shows)):
				if isinstance(shows[i], type(ET.Element(None))):
					starttime = parse_xmltv_date(shows[i].get("start"))
					stoptime = shows[i].get("stop")
					if stoptime is not None:
						stoptime = parse_xmltv_date(stoptime)
						shows[i] = {"xml": shows[i], "start": starttime, "stop": stoptime}
					else:
						shows[i] = {"xml": shows[i], "start": starttime}

			shows.sort(key=lambda r: r["start"])

			for i in range(len(shows)):
				show = shows[i]
				if "stop" in show:
					stoptime = show["stop"]
				elif i < len(shows) - 1:
					stoptime = shows[i + 1]["start"]
				else:
					break

				# don't store shows that are already finished
				if stoptime < now:
					bar.max -= 1
					bar.max = max(bar.max, 1)
					continue

				starttime = show["start"]
				# don't store shows that start more than "timespan" hours in the future
				if (starttime - now).total_seconds() / 3600 > timespan:
					break

				if "xml" in show:
					show = show["xml"]
					show.set("channel", channelid)
					tv.append(show)
				else:
					url = show.pop("details-url", None)
					if url is not None and len(url) > 0:
						if timespan_full > -1 and (starttime - now).total_seconds() / 3600 <= timespan_full:
							if caching and (starttime - now).total_seconds() / 3600 > timespan_force:
								force = False
								try:
									try:
										details = cache_new[url]
									except KeyError:
										details = cache[url]
									show.update(details)
									cache_new[url] = details
								except KeyError:
									force = True
							else:
								force = True
							if force:
								try:
									details = site.grabdetails(url)
									show.update(details)
									if caching:
										# don't store times in cache
										details.pop("start", None)
										details.pop("stop", None)
										cache_new[url] = details
								except (AttributeError, TypeError):
									pass
					programme = ET.SubElement(
						tv, "programme",
						start=starttime.strftime("%Y%m%d%H%M%S %z"),
						stop=stoptime.strftime("%Y%m%d%H%M%S %z"),
						channel=channelid,
					)
					process_show(programme, show)
				bar.next()
			if bar.index > 0:
				bar.max = bar.index
			else:
				print("0 shows found.")
			bar.update()
			bar.finish()
		else:
			print("0 shows found.")
	if len(cache_new) > 0:
		with open("cached_epg.pkl", "wb") as fp:
			pickle.dump(cache_new, fp)
	return tv
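
`parse_xmltv_date` is a project helper not shown above. Since `create_epg` writes timestamps with `strftime("%Y%m%d%H%M%S %z")` and compares parsed values against the timezone-aware `now`, a minimal sketch might look like this (the offset-less fallback is an assumption):

    import datetime
    import pytz

    def parse_xmltv_date(s):
        # XMLTV timestamps look like "20240131203000 +0100"
        try:
            return datetime.datetime.strptime(s, "%Y%m%d%H%M%S %z")
        except ValueError:
            # assumption: treat timestamps without an offset as UTC
            return pytz.utc.localize(
                datetime.datetime.strptime(s, "%Y%m%d%H%M%S"))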
Code Example #3
import csv
import os

import numpy as np
from progress.bar import Bar

# project helpers defined elsewhere: load_glove_vocab, load_simple_questions,
# remove_unknown_answers, replace_unknowns, append_pads, load_glove_embeddings,
# convert_to_idx, TOK_UNK, TOK_PAD

def load_simple_questions_dataset(config, force_reload=False):
    bar = Bar(suffix='%(index)d/%(max)d - %(elapsed)ds')

    data_npz = os.path.join(config.data_dir, 'data.npz')
    word2idx_txt = os.path.join(config.data_dir, 'word2idx.txt')

    if (os.path.exists(data_npz) and os.path.exists(word2idx_txt)
            and not force_reload):
        bar.max = 2

        bar.message = 'Loading npz'
        bar.next()
        npz = np.load(data_npz)
        embd_mat = npz['embd_mat']
        train_ques = npz['train_ques'].astype(np.int32)
        train_ans = npz['train_ans'].astype(np.int32)
        valid_ques = npz['valid_ques'].astype(np.int32)
        valid_ans = npz['valid_ans'].astype(np.int32)

        bar.message = 'Loading word2idx'
        bar.next()
        with open(word2idx_txt) as f:
            reader = csv.reader(f, delimiter='\t')
            word2idx = {row[0]: int(row[1]) for row in reader}

        bar.finish()
        train = train_ques, train_ans
        valid = valid_ques, valid_ans
        return train, valid, embd_mat, word2idx

    bar.max = 8

    bar.message = 'Loading GloVe vocab'
    bar.next()
    glove_vocab = load_glove_vocab(os.path.join(config.data_dir, 'glove'),
                                   '42B', 300)

    bar.message = 'Loading SimpleQuestions'
    bar.next()
    train, valid, dataset_vocab = load_simple_questions(config)

    bar.message = 'Removing unknown answers'
    bar.next()
    train, new_vocab = remove_unknown_answers(train, glove_vocab)
    dataset_vocab.update(new_vocab)

    valid, new_vocab = remove_unknown_answers(valid, glove_vocab)
    dataset_vocab.update(new_vocab)

    train_q, train_a = train[0], train[1]
    valid_q, valid_a = valid[0], valid[1]

    bar.message = 'Replacing unknown tokens'
    bar.next()
    unknowns = dataset_vocab - glove_vocab
    train_q = replace_unknowns(train_q, unknowns)
    train_a = replace_unknowns(train_a, unknowns)
    valid_q = replace_unknowns(valid_q, unknowns)
    valid_a = replace_unknowns(valid_a, unknowns)
    vocab = dataset_vocab - unknowns

    bar.message = 'Appending pads'
    bar.next()
    max_len = max(len(sent) for sent in train_q + valid_q)
    train_q = append_pads(train_q, max_len)
    valid_q = append_pads(valid_q, max_len)
    vocab.update([TOK_UNK, TOK_PAD])

    bar.message = 'Loading GloVe embeddings'
    bar.next()
    embd_mat, word2idx = load_glove_embeddings(
        os.path.join(config.data_dir, 'glove'), '42B', 300, vocab)

    bar.message = 'Converting token to index'
    bar.next()
    train_q = convert_to_idx(train_q, word2idx)
    train_a = convert_to_idx(train_a, word2idx)
    valid_q = convert_to_idx(valid_q, word2idx)
    valid_a = convert_to_idx(valid_a, word2idx)

    bar.message = 'Saving processed data'
    bar.next()
    with open(word2idx_txt, 'w', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(word2idx.items())
    data_dict = dict(embd_mat=embd_mat,
                     train_ques=train_q,
                     train_ans=train_a,
                     valid_ques=valid_q,
                     valid_ans=valid_a)
    np.savez(data_npz, **data_dict)

    bar.finish()
    train = np.array(train_q), np.array(train_a)
    valid = np.array(valid_q), np.array(valid_a)
    return train, valid, embd_mat, word2idx
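
A hypothetical invocation, assuming a config object that exposes a `data_dir` attribute pointing at a directory containing the GloVe files and the SimpleQuestions data (the path below is illustrative):

    class Config:
        data_dir = './data'  # hypothetical; must contain glove/ and the dataset

    train, valid, embd_mat, word2idx = load_simple_questions_dataset(Config())
    train_ques, train_ans = train
    print('train questions:', train_ques.shape)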