def balanced(env, maxDocs, params):
    """Crawl from the start node, letting ``PopLink`` choose each next link.

    The walk begins with the single link of ``env.nodes[sys.maxsize]``. Each
    time a node with an unseen ``urlId`` is reached, its language tally is
    incremented, its outgoing links are handed to ``AddTodo`` and the current
    ``NumParallelDocs(env, visited)`` value is recorded.

    Args:
        env: Crawl environment; must expose a ``nodes`` mapping.
        maxDocs: The loop stops once this many distinct url ids were visited.
        params: Run settings. ``params.debug`` enables progress printing; the
            object is also forwarded to ``PopLink``.

    Returns:
        A list with one ``NumParallelDocs`` result per newly visited node, in
        visiting order.
    """
    parallelCounts = []
    seenUrlIds = set()
    langTally = {}
    pending = {}

    startNode = env.nodes[sys.maxsize]
    # The start node is required to have exactly one outgoing link.
    assert (len(startNode.links) == 1)
    link = next(iter(startNode.links))

    while link is not None and len(seenUrlIds) < maxDocs:
        node = link.childNode
        if node.urlId not in seenUrlIds:
            seenUrlIds.add(node.urlId)
            langTally[node.lang] = langTally.get(node.lang, 0) + 1
            if params.debug and len(seenUrlIds) % 40 == 0:
                print(" langsVisited", langTally)

            for outLink in node.links:
                AddTodo(pending, seenUrlIds, outLink)

            parallelCounts.append(NumParallelDocs(env, seenUrlIds))

        # A None result from PopLink ends the crawl (see the loop condition).
        link = PopLink(pending, langTally, params)

    return parallelCounts
def dumb(env, maxDocs, params, breadthOrDepth):
    """Crawl from ``env.rootNode`` in plain breadth-first or depth-first order.

    Args:
        env: Crawl environment; must expose ``rootNode``.
        maxDocs: The loop stops once this many distinct url ids were visited.
        params: Run settings; only ``params.debug`` is read here.
        breadthOrDepth: ``0`` takes the oldest queued node first (FIFO,
            breadth-first); any other value takes the newest queued node
            first (LIFO, depth-first).

    Returns:
        A list with one ``NumParallelDocs(env, visited)`` result per newly
        visited node, in visiting order.
    """
    # Local import keeps this block self-contained. A deque gives O(1) removal
    # at both ends, whereas list.pop(0) shifts every remaining element.
    from collections import deque

    ret = []
    todo = deque([env.rootNode])
    visited = set()
    langsVisited = {}
    breadthFirst = breadthOrDepth == 0  # loop-invariant, so tested once

    while todo and len(visited) < maxDocs:
        node = todo.popleft() if breadthFirst else todo.pop()

        # A node may be queued several times; only its first pop counts.
        if node.urlId in visited:
            continue

        visited.add(node.urlId)
        langsVisited[node.lang] = langsVisited.get(node.lang, 0) + 1
        if params.debug and len(visited) % 40 == 0:
            print(" langsVisited", langsVisited)

        todo.extend(link.childNode for link in node.links)

        ret.append(NumParallelDocs(env, visited))

    return ret
def randomCrawl(env, maxDocs, params):
    """Crawl from ``env.rootNode``, picking the next node uniformly at random.

    Args:
        env: Crawl environment; must expose ``rootNode``.
        maxDocs: The loop stops once this many distinct url ids were visited.
        params: Run settings; only ``params.debug`` is read here.

    Returns:
        A list with one ``NumParallelDocs(env, visited)`` result per newly
        visited node, in visiting order.
    """
    parallelCounts = []
    frontier = [env.rootNode]
    seenUrlIds = set()
    langTally = {}

    while len(frontier) > 0 and len(seenUrlIds) < maxDocs:
        # Draw an index into the frontier from NumPy's global random state.
        pick = np.random.randint(0, len(frontier))
        node = frontier.pop(pick)

        # Nodes can be queued more than once; repeats are dropped here.
        if node.urlId in seenUrlIds:
            continue

        seenUrlIds.add(node.urlId)
        langTally[node.lang] = langTally.get(node.lang, 0) + 1
        if params.debug and len(seenUrlIds) % 40 == 0:
            print(" langsVisited", langTally)

        frontier.extend(link.childNode for link in node.links)

        parallelCounts.append(NumParallelDocs(env, seenUrlIds))

    return parallelCounts
def linkText(env, maxDocs, params):
    """Crawl from the start node, letting ``PopLinkLinkText`` pick each link.

    Same loop shape as a queue-driven crawl: outgoing links of every newly
    visited node are passed to ``AddTodoLinkText`` and the next link to follow
    is whatever ``PopLinkLinkText`` returns.

    Args:
        env: Crawl environment; must expose a ``nodes`` mapping.
        maxDocs: The loop stops once this many distinct url ids were visited.
        params: Run settings, forwarded to ``PopLinkLinkText``.

    Returns:
        A list with one ``NumParallelDocs(env, visited)`` result per newly
        visited node, in visiting order.
    """
    parallelCounts = []
    seenUrlIds = set()
    pending = {}

    startNode = env.nodes[sys.maxsize]
    # The start node is required to have exactly one outgoing link.
    assert (len(startNode.links) == 1)
    link = next(iter(startNode.links))

    while link is not None and len(seenUrlIds) < maxDocs:
        node = link.childNode
        if node.urlId not in seenUrlIds:
            seenUrlIds.add(node.urlId)

            for outLink in node.links:
                AddTodoLinkText(pending, seenUrlIds, outLink)

            parallelCounts.append(NumParallelDocs(env, seenUrlIds))

        # A None result ends the crawl (see the loop condition).
        link = PopLinkLinkText(pending, params)

    return parallelCounts
def Trajectory(env, epoch, params, sess, qns):
    """Run one crawl episode with two Q-networks, storing each transition.

    At every step the two entries of ``qns.q`` are assigned to the roles
    ``qnA``/``qnB`` by a random draw, ``Neural`` produces a transition, and
    the transition is added to the corpus of one of the two networks, chosen
    by a second random draw.

    Args:
        env: Crawl environment; must expose a ``nodes`` mapping.
        epoch: Accepted but not used inside this function.
        params: Run settings; ``langIds`` and ``maxDocs`` are read here and
            the object is forwarded to the helpers.
        sess: Forwarded to ``Neural`` unchanged.
        qns: Object whose ``q`` holds at least two networks, each exposing a
            ``corpus`` with ``AddTransition``.

    Returns:
        A list with one ``NumParallelDocs(env, visited)`` result per visited
        node, in visiting order.
    """
    parallelCounts = []
    visited = set()
    langsVisited = np.zeros([1, 3])  # langId -> count
    candidates = Candidates(params, env)
    node = env.nodes[sys.maxsize]

    while True:
        # First draw: which network plays which role this step.
        if np.random.rand(1) > 0.5:
            qnA, qnB = qns.q[0], qns.q[1]
        else:
            qnA, qnB = qns.q[1], qns.q[0]

        assert (node.urlId not in visited)
        visited.add(node.urlId)

        UpdateLangsVisited(langsVisited, node, params.langIds)
        candidates.AddLinks(node, visited, params)

        parallelCounts.append(NumParallelDocs(env, visited))

        transition = Neural(env, params, candidates, visited, langsVisited, sess, qnA, qnB)

        # Url id 0 as the next target ends the episode without storing it.
        if transition.nextURLId == 0:
            break

        # Second draw: which network's corpus receives the transition.
        corpus = qnA.corpus if np.random.rand(1) > 0.5 else qnB.corpus
        corpus.AddTransition(transition)
        node = env.nodes[transition.nextURLId]

        if len(visited) > params.maxDocs:
            break

    return parallelCounts
def byCrawlDate(env, maxDocs, params):
    """Visit nodes in ascending ``crawlDate`` order and record progress.

    Nodes whose ``urlId`` is ``0`` or ``sys.maxsize`` are skipped.

    Args:
        env: Crawl environment; must expose a ``nodes`` mapping.
        maxDocs: Iteration stops once this many distinct url ids were visited.
        params: Accepted but not used inside this function.

    Returns:
        A list with one ``NumParallelDocs(env, visited)`` result per newly
        visited node, in visiting order.
    """
    allNodes = list(env.nodes.values())
    print("nodes", len(allNodes))
    ordered = sorted(allNodes, key=lambda n: n.crawlDate)

    skippedIds = (0, sys.maxsize)
    parallelCounts = []
    seenUrlIds = set()

    for node in ordered:
        if len(seenUrlIds) >= maxDocs:
            break

        urlId = node.urlId
        if urlId in skippedIds or urlId in seenUrlIds:
            continue

        seenUrlIds.add(urlId)
        parallelCounts.append(NumParallelDocs(env, seenUrlIds))

    return parallelCounts
def Trajectory(env, params, sess, qn, corpus, test):
    """Run one crawl episode with a single Q-network.

    In training mode (``test`` falsy) every transition is added to ``corpus``.
    In test mode nothing is stored; instead three summary lines (actions,
    language path, rewards) are printed after the episode.

    Args:
        env: Crawl environment; must expose a ``nodes`` mapping.
        params: Run settings; ``langIds``, ``gamma`` and ``maxCrawl`` are read
            here and the object is forwarded to the helpers.
        sess: Forwarded to ``Neural`` unchanged.
        qn: Forwarded to ``Neural`` unchanged.
        corpus: Receives each transition via ``AddTransition`` when not in
            test mode.
        test: Truthy selects test mode.

    Returns:
        A tuple ``(counts, totalReward, totalDiscountedReward)`` where
        ``counts`` has one ``NumParallelDocs`` result per step.
    """
    parallelCounts = []
    rewardSum = 0.0
    discountedSum = 0.0
    discount = 1.0

    startNode = env.nodes[sys.maxsize]
    seedVisited = {startNode.urlId}

    seedCandidates = Candidates(params, env)
    seedCandidates.AddLinks(startNode, seedVisited, params)
    seedCandidates.Group(seedVisited)

    # Initial transition carrying the start state into the first Neural call.
    transition = Transition(env, -1, 0, None, params.langIds, None, None, seedVisited, seedCandidates)

    if test:
        pathParts = ["lang:" + str(startNode.lang)]
        rewardParts = ["rewards:"]
        actionParts = ["actions:"]

    while True:
        transition, reward = Neural(env, params, transition, sess, qn)

        parallelCounts.append(NumParallelDocs(env, transition.visited))

        rewardSum += reward
        discountedSum += discount * reward
        discount *= params.gamma

        if not test:
            corpus.AddTransition(transition)
        else:
            reached = transition.link.childNode
            pathParts.append("->" + str(reached.lang))
            rewardParts.append("->" + str(reward))
            actionParts.append(str(transition.action) + " ")
            # A star marks a node that has an aligned counterpart.
            if reached.alignedNode is not None:
                pathParts.append("*")

        # Stop when nothing is left to follow or the crawl budget is exceeded.
        if transition.nextCandidates.Count() == 0 or len(transition.visited) > params.maxCrawl:
            break

    if test:
        pathParts.append(" " + str(len(parallelCounts)))
        rewardParts.append(" " + str(rewardSum) + "/" + str(discountedSum))

        print("".join(actionParts))
        print("".join(pathParts))
        print("".join(rewardParts))

    return parallelCounts, rewardSum, discountedSum
def Walk(env, params, sess, qns):
    """Walk the environment with the first Q-network and print a summary.

    Starting from ``env.nodes[sys.maxsize]``, each step marks the current
    node as visited, adds its links to the candidate set, and asks
    ``NeuralWalk`` for the next link to follow. Three summary lines (actions,
    language path with ``*`` after nodes that have an aligned counterpart,
    rewards) are printed once the walk ends.

    The walk ends when the node just moved to has ``urlId == 0`` or when more
    than ``params.maxDocs`` nodes have been visited.

    Args:
        env: Crawl environment; must expose a ``nodes`` mapping.
        params: Run settings; ``langIds``, ``gamma`` and ``maxDocs`` are read
            here and the object is forwarded to the helpers.
        sess: Forwarded to ``NeuralWalk`` unchanged.
        qns: Object whose ``q[0]`` is the network used for every step.

    Returns:
        A tuple ``(counts, totalReward, totalDiscountedReward)`` where
        ``counts`` has one ``NumParallelDocs(env, visited)`` result per step.

    Raises:
        AssertionError: If the walk arrives at a node that was already
            visited.
    """
    ret = []
    visited = set()
    langsVisited = np.zeros([1, 3])  # langId -> count
    candidates = Candidates(params, env)
    node = env.nodes[sys.maxsize]

    # Summary lines are collected as parts and joined once at the end, which
    # avoids re-copying a growing string on every step.
    pathParts = ["lang:" + str(node.lang)]
    rewardParts = ["rewards:"]
    actionParts = ["actions:"]

    totReward = 0.0
    totDiscountedReward = 0.0
    discount = 1.0

    while True:
        assert (node.urlId not in visited)
        visited.add(node.urlId)

        UpdateLangsVisited(langsVisited, node, params.langIds)
        candidates.AddLinks(node, visited, params)

        ret.append(NumParallelDocs(env, visited))

        # Only the last three of the eleven returned values are needed here.
        # NOTE(review): the literal 0.0 is passed positionally; its meaning is
        # defined by NeuralWalk - confirm against that function.
        _, _, _, _, _, _, _, _, action, link, reward = NeuralWalk(
            env, params, 0.0, candidates, visited, langsVisited, sess, qns.q[0])
        node = link.childNode

        actionParts.append(str(action) + " ")
        totReward += reward
        totDiscountedReward += discount * reward

        pathParts.append("->" + str(node.lang))
        rewardParts.append("->" + str(reward))
        if node.alignedNode is not None:
            pathParts.append("*")

        discount *= params.gamma

        if node.urlId == 0 or len(visited) > params.maxDocs:
            break

    # One entry is appended to ret per step, so len(ret) is the step count.
    pathParts.append(" " + str(len(ret)))
    rewardParts.append(" " + str(totReward) + "/" + str(totDiscountedReward))

    print("".join(actionParts))
    print("".join(pathParts))
    print("".join(rewardParts))

    return ret, totReward, totDiscountedReward