Beispiel #1
0
 def safely_call(self, func):
     """
     Invoke the zero-argument callable ``func`` and hand back whatever it returns.

     A ``wikipedia.PageError`` or ``wikipedia.DisambiguationError`` raised by the call is
     reported through ``echo`` and converted into a ``None`` result. Any other exception
     propagates to the caller unchanged.
     """
     # TODO We need to handle "connection aborted" ConnectionResetError too.
     outcome = None
     try:
         outcome = func()
     except wikipedia.PageError as err:
         echo("Found red link", err)
     except wikipedia.DisambiguationError as err:
         echo("Ambiguous article found", err)
     return outcome
Beispiel #2
0
def expr_run(args):
    """
    Run every expression obtained from ``command.read(args.expr())`` and print the result.

    Each expression is executed through ``search.CommandSearch(...).run()`` against one
    shared dict, which is then converted with ``xmlify.xmlify``, serialised with
    ``ET.tostring`` and printed as text.

    If a ``TokenizeError`` is raised anywhere in that sequence, its message is sent to
    ``logger.echo`` and the literal ``<data />`` is printed instead.
    """
    try:
        collected = {}
        for expression in command.read(args.expr()):
            search.CommandSearch(expression, collected).run()
        document = xmlify.xmlify(collected)
        serialized = ET.tostring(document)
        print(serialized.decode())
    except TokenizeError as err:
        logger.echo(str(err))
        print("<data />")
Beispiel #3
0
 def __init__(self, argv, arg_set):
     """
     Populate ``self._args`` by running getopt over ``argv``.

     The option string handed to getopt is read from ``arg_set.string``; ``arg_set`` is
     expected to be an ArgSet instance. The first element of getopt's result is stored
     as a dict in ``self._args``.

     When getopt raises ``GetoptError``, ``self._args`` becomes an empty dict and the
     error is reported through ``echo``.
     """
     try:
         option_pairs = getopt(argv, arg_set.string)[0]
         self._args = dict(option_pairs)
     except GetoptError as err:
         self._args = {}
         echo("Error in arguments:", err)
Beispiel #4
0
 def _crawl_once(page, depth_):
     """
     Walk from ``page`` along selected links until ``match_function`` accepts a page.

     ``depth_`` is compared against ``self.depth`` and grows by one on every recursive
     step. Returns the accepted page, or None when ``depth_`` has reached ``self.depth``
     or the selector returns no link.
     """
     self.wait()
     echo("Trying", escape(page.title), "at", depth_, flush = True)

     if match_function(page):
         echo("Taking", escape(page.title))
         return page

     # Depth budget exhausted: give up on this walk.
     if depth_ >= self.depth:
         return None

     link_state = LinkState(page, self.depth - depth_, self.depth)
     next_link = self.selector.select_link(link_state)
     if next_link is None:
         return None

     return _crawl_once(wikipedia.page(next_link), depth_ + 1)
Beispiel #5
0
 def select_link(self, state):
     """
     Choose the next link to follow from the page held by ``state``.

     Links already recorded in ``self.pages`` are skipped, as are links whose score from
     ``self.db.get_score`` has a ``denom`` that is not greater than zero. The remaining
     links are paired with their score converted to ``float``.

     If no candidates remain, or ``self.should_explore(state)`` is true, the choice is
     delegated to ``self.explore.select_link``. Otherwise it comes from
     ``self._weighted_random`` over the scored candidates. A truthy choice is appended to
     ``self.pages`` before being returned.
     """
     current = state.page()
     if not self.pages: # We are on the first page so store this one too
         self.pages.append(current.title)

     # Conditions are checked per link in the same order as the original lazy chain:
     # membership first, then the denominator, then the float conversion.
     candidates = [
         (title, float(self.db.get_score(title)))
         for title in current.links
         if title not in self.pages
         if self.db.get_score(title).denom > 0
     ]

     if candidates and not self.should_explore(state):
         echo("Using prior knowledge", level = 2)
         choice = self._weighted_random(candidates)
     else:
         echo("Exploring", level = 2)
         choice = self.explore.select_link(state)

     if choice:
         self.pages.append(choice)
     return choice
Beispiel #6
0
 def _crawl_once(page, depth_):
     """
     Try ``page`` against ``match_function`` and, if it is not accepted, recurse into the
     page reached through the link chosen by ``self.selector``.

     Returns the accepted page. Returns None once ``depth_`` is no longer below
     ``self.depth`` or when the selector has no link to offer.
     """
     self.wait()
     echo(" Trying:", escape(page.title), "(" + str(depth_) + ")", flush=True)

     if match_function(page):
         echo("  Accepted:", escape(page.title))
         return page
     if depth_ >= self.depth:
         return None

     remaining = self.depth - depth_
     chosen = self.selector.select_link(LinkState(page, remaining, self.depth))
     if chosen is None:
         return None

     following_page = wikipedia.page(chosen)
     return _crawl_once(following_page, depth_ + 1)
Beispiel #7
0
 def _safely_call(n):
     """
     Call ``func`` and return its result, translating known failures into None.

     ``n`` is the retry counter. After a ``ConnectionResetError`` the call is retried with
     ``n + 1`` as long as ``n`` is below ``self.max_aborts``; otherwise the attempt is
     abandoned. Page errors, disambiguation errors and ``requests`` connection errors are
     echoed and yield None without retrying.
     """
     try:
         return func()
     except wikipedia.PageError as err:
         echo("Found red link:", werror.wrap(err))
     except wikipedia.DisambiguationError as err:
         echo("Ambiguous article found:", werror.wrap(err))
     except ConnectionResetError as err:
         if n >= self.max_aborts:
             echo("Connection reset", err, "...", "aborting")
         else:
             echo("Connection reset", err, "...", "retrying")
             return _safely_call(n + 1)
     except requests.exceptions.ConnectionError as err:
         echo("Aborting because of connection error", err)
     # Every handled failure that did not retry falls through to here.
     return None
Beispiel #8
0
 def crawl_times(self, base, match_function):
     """
     Performs self.crawl_once until a single match is found or self.max_tries attempts have
     been made.

     base is either a page title given as a str (including str subclasses), which is
     loaded with wikipedia.page through self.safely_call, or an object with a title
     attribute that is used as the starting page directly.

     Returns the first truthy result of self.crawl_once. Returns None if the title could
     not be loaded or if every attempt was rejected.
     """
     # isinstance rather than an exact type check: a str subclass is still a title. It
     # would otherwise reach the page branch, where .title is the bound str.title method.
     if isinstance(base, str):
         title = base
         echo("Basis:", escape(title))
         # Bind the title to its own name so the lambda does not close over a variable
         # that is rebound below.
         base = self.safely_call(lambda: wikipedia.page(title))
         if not base:
             return None
     else:
         echo("Basis:", escape(base.title))
     for _ in range(self.max_tries):
         res = self.crawl_once(base, match_function)
         if res:
             return res
         echo("  Rejected")
     return None