Beispiel #1
0
 def __init__(self, radius, minPoint, distance=DDistance.DefaultDistance()):
     """Check the constructor arguments and keep them on the instance.

     radius   -- stored as ``self.radius``; passed through ``judge_null``.
     minPoint -- stored as ``self.minPoint``; passed through ``judge_null``.
     distance -- stored as ``self.distance``; passed through ``judge_type``
                 together with ``DDistance.DDdistance``. The default object
                 is created once, when this method is defined, so every
                 instance relying on the default shares it.

     The ``exceptions2`` helpers presumably raise on a failed check --
     confirm against that module.
     """
     # Checks run before any attribute is set, in the order
     # distance -> radius -> minPoint.
     exceptions2.judge_type(distance, DDistance.DDdistance)
     for required in (radius, minPoint):
         exceptions2.judge_null(required)
     self.distance, self.radius, self.minPoint = distance, radius, minPoint
Beispiel #2
0
 def __init__(self, radius, minPoint, distance=DDistance.DefaultDistance()):
     """Validate ``distance``, ``radius`` and ``minPoint`` and store them.

     The three values end up on ``self.distance``, ``self.radius`` and
     ``self.minPoint``. ``distance`` is checked with ``judge_type`` against
     ``DDistance.DDdistance``; the other two are checked with
     ``judge_null``. How a failed check is reported is decided by
     ``exceptions2`` (presumably an exception -- confirm).

     The default ``distance`` is evaluated a single time at definition, so
     it is one shared object rather than a fresh one per instance.
     """
     # Keep the validation order of distance, then radius, then minPoint.
     exceptions2.judge_type(distance, DDistance.DDdistance)
     for mandatory in (radius, minPoint):
         exceptions2.judge_null(mandatory)
     self.distance, self.radius, self.minPoint = distance, radius, minPoint
Beispiel #3
0
def get_html_charset(html):
    """Guess the charset/encoding of an HTML document with chardet.

    Returns ``None`` straight away when ``html`` is ``None``. Otherwise
    ``html`` is passed through ``exceptions2.judge_type`` with
    ``basestring`` and then handed to ``chardet.detect``; the value under
    the ``'encoding'`` key of the detection result is returned.
    """
    if html is not None:
        exceptions2.judge_type(html, basestring)
        detection = chardet.detect(html)
        return detection['encoding']
    return None
Beispiel #4
0
 def __init__(self, name, *argv, **kw):
     """Configure the spider from keyword options.

     ``name`` is used for the logger and stored as ``self.name``; extra
     positional arguments (``*argv``) are accepted but not used here.

     Keys read from ``kw`` (default in parentheses):
         log_level ("warn"), allow_site ([]), start_urls ([]),
         page_processor (no default), proxy_policy (None),
         fetcher (BaseRequestsFetcher()), pipeline ([ConsolePipeLine()]),
         queue (MemoryFifoQueue(10000)), before_crawl ([]),
         url_filters ([]), fetch_coding (None),
         listeners ([DefaultSpiderListener()]), crawled_filter (None),
         timeout (120).

     ``page_processor`` must be a list/tuple or a single ``PageProcessor``
     (which is wrapped in a list); anything else, including a missing
     value, raises ``TypeError``. ``start_urls`` is checked with
     ``exceptions2.judge_null`` and ``proxy_policy``, when given, with
     ``exceptions2.judge_type`` against ``BaseProxyPolicy``.

     The ``url_filters`` list supplied by the caller is copied before the
     per-site filters are appended and the result is sorted, so the
     caller's list is left untouched.
     """
     self.log_level = kw.get("log_level", "warn")
     self.logger = log2.get_stream_logger(self.log_level, name)
     self.name = name
     self.allow_site = kw.get("allow_site", [])
     self.logger.info("spider {} init , allow_site {}".format(
         name, self.allow_site))
     self.start_urls = kw.get("start_urls", [])
     exceptions2.judge_null(self.start_urls)
     # Normalise page_processor to a sequence of processors.
     page_processor = kw.get("page_processor")
     if isinstance(page_processor, (list, tuple)):
         self.page_processor = page_processor
     elif isinstance(page_processor, PageProcessor):
         self.page_processor = [page_processor]
     else:
         raise TypeError("page_processor is list or PageProcessor")
     self.proxy_policy = kw.get("proxy_policy", None)
     if self.proxy_policy is not None:
         exceptions2.judge_type(self.proxy_policy, BaseProxyPolicy)
     self.fetcher = kw.get("fetcher", BaseRequestsFetcher())
     # The fetcher receives the policy even when it is None.
     self.fetcher.setProxy(self.proxy_policy)
     self.pipelines = kw.get("pipeline", [ConsolePipeLine()])
     self.run_flag = True
     self.spid = rand2.get_random_seq(10)
     self.url_pool = kw.get("queue", MemoryFifoQueue(10000))
     self.logger.info("init")
     self.before_crawl = kw.get("before_crawl",
                                [])  # before crawl do something
     self.site_filters = [SiteFilter(site) for site in self.allow_site]
     # Work on a copy: extending and sorting the caller's own list would
     # leak this spider's site filters into every other user of that list
     # (e.g. a second spider built from the same options).
     url_filters = list(kw.get("url_filters", []))
     url_filters.extend(self.site_filters)
     # Order the filters by priority; sort2 is given the attribute name
     # "_priority" to sort on.
     sort2.sort_list_object(url_filters, "_priority")
     self.url_filters = url_filters
     self.listeners = SpiderListener()
     self.fetch_coding = kw.get("fetch_coding", None)
     self.listeners.addListener(
         kw.get("listeners", [DefaultSpiderListener()]))
     self.link_extractors = CssSelector("a[href]")
     self.crawled_filter = kw.get("crawled_filter", None)
     self.timeout = kw.get("timeout", 120)
Beispiel #5
0
 def __init__(self, t1, t2, calc_distance=DDistance.DefaultDistance()):
     """Validate the two thresholds and the distance object, then store them.

     t1, t2        -- each checked with ``judge_type`` against
                      ``(int, float)`` and stored as ``self.t1`` /
                      ``self.t2``. ``judge_smaller(t2, t1)`` is then
                      applied; it presumably enforces an ordering between
                      the two -- see ``exceptions2`` for the exact rule.
     calc_distance -- checked with ``judge_null`` and with ``judge_type``
                      against ``DDistance.DDdistance``; stored as
                      ``self._calc_distance``. The default is built once at
                      definition time and therefore shared.
     """
     exceptions2.judge_null(calc_distance)
     exceptions2.judge_type(calc_distance, DDistance.DDdistance)
     # Both thresholds go through the same numeric type check, t1 first.
     for threshold in (t1, t2):
         exceptions2.judge_type(threshold, (int, float))
     exceptions2.judge_smaller(t2, t1)
     self.t1, self.t2 = t1, t2
     self._calc_distance = calc_distance
Beispiel #6
0
 def __init__(self, t1, t2, calc_distance=DDistance.DefaultDistance()):
     """Store ``t1``, ``t2`` and ``calc_distance`` after checking them.

     ``calc_distance`` goes through ``judge_null`` and a ``judge_type``
     check against ``DDistance.DDdistance``. ``t1`` and ``t2`` must pass a
     ``judge_type`` check against ``(int, float)``, followed by
     ``judge_smaller(t2, t1)`` (the precise comparison is defined in
     ``exceptions2`` -- TODO confirm its direction).

     Results are kept on ``self.t1``, ``self.t2`` and
     ``self._calc_distance``. The default ``calc_distance`` is a single
     object created when the method is defined.
     """
     exceptions2.judge_null(calc_distance)
     exceptions2.judge_type(calc_distance, DDistance.DDdistance)
     # Same numeric check for both thresholds, in the order t1 then t2.
     for bound in (t1, t2):
         exceptions2.judge_type(bound, (int, float))
     exceptions2.judge_smaller(t2, t1)
     self.t1, self.t2 = t1, t2
     self._calc_distance = calc_distance
Beispiel #7
0
 def __init__(self, label, center_vector,
              distance=DDistance.DefaultDistance()):
     """Type-check the arguments and keep them on the instance.

     label         -- checked against ``(int, long, basestring)``; stored
                      as ``self.label``.
     center_vector -- checked against ``(list, tuple)``; stored as
                      ``self.vector``.
     distance      -- checked against ``DDistance.DDdistance``; stored as
                      ``self._distance``. The default object is created
                      once at definition time and shared between
                      instances that do not pass their own.

     All checks use ``exceptions2.judge_type`` and run before anything is
     assigned.
     """
     label_types = (int, long, basestring)
     exceptions2.judge_type(label, label_types)
     vector_types = (list, tuple)
     exceptions2.judge_type(center_vector, vector_types)
     exceptions2.judge_type(distance, DDistance.DDdistance)
     self.label, self.vector = label, center_vector
     self._distance = distance
Beispiel #8
0
 def train(self, datas, labels, *argv, **kw):
     """Train on every (data, label) pair, one pair at a time.

     ``datas`` and ``labels`` are both passed through ``judge_null``;
     ``datas`` is additionally checked with ``judge_type`` against
     ``(list, tuple, DataSet.DataSet)``. Each pair is then forwarded to the
     private ``__train`` method. ``*argv`` and ``**kw`` are accepted but
     not used here.
     """
     for required in (datas, labels):
         exceptions2.judge_null(required)
     exceptions2.judge_type(datas, (list, tuple, DataSet.DataSet))
     # zip stops at the shorter of the two inputs, so any surplus items in
     # the longer one are not trained on.
     for sample in zip(datas, labels):
         self.__train(*sample)
Beispiel #9
0
 def train(self, datas, labels, *argv, **kw):
     """Run the private per-sample trainer over paired data and labels.

     Both ``datas`` and ``labels`` go through ``exceptions2.judge_null``,
     and ``datas`` must also pass ``judge_type`` against
     ``(list, tuple, DataSet.DataSet)``. Extra positional and keyword
     arguments are accepted and ignored by this method.
     """
     for mandatory in (datas, labels):
         exceptions2.judge_null(mandatory)
     exceptions2.judge_type(datas, (list, tuple, DataSet.DataSet))
     # Pairs are formed with zip, which ends at the shorter input.
     for pair in zip(datas, labels):
         self.__train(*pair)
Beispiel #10
0
 def __init__(self, model_path=None):
     """Create the object, optionally loading a model from ``model_path``.

     ``self._model`` starts as ``None``. When ``model_path`` is truthy it
     is checked with ``exceptions2.judge_type`` against ``basestring`` and
     passed to ``self._load``; the result is assigned to ``self.model``.
     A falsy ``model_path`` (``None``, empty string) skips loading.
     """
     self._model = None
     if model_path:
         # Validate the path that is about to be loaded; no other name
         # is in scope here to check.
         exceptions2.judge_type(model_path, basestring)
         # NOTE(review): the loaded model lands on ``self.model`` while the
         # placeholder above is ``self._model`` -- confirm ``model`` is a
         # property backed by ``_model``.
         self.model = self._load(model_path)