Example #1
def main():
    sd = input("Start Date(yyyy,m,d): ")
    ed = input("End Date(yyyy,m,d): ")
    print(datetime.datetime.now())  # timestamp before scraping, for timing
    multiParsedTagList = hp.get_fullParsedTagList(sd, ed)
    tagSelect = sc.get_singlePageInfo(multiParsedTagList)
    pageInfos = sc.get_pageInfos(tagSelect)
    # pp.print_mergedList(pageInfos)
    pp.save_csv(pageInfos, sd, ed)
    print(datetime.datetime.now())  # timestamp after the CSV is saved
Example #2
def run():
    sets = Pipeline(time.site_id, time.site_name).structure_set()
    Pipeline(time.site_id, time.site_name).open_spider(sets)

    for item in Time().first_requests():
        Pipeline(time.site_id, time.site_name).process_item(item)
        Pipeline(time.site_id, time.site_name).upload_item(item, sets)

    try:
        Pipeline(time.site_id, time.site_name).close_spider()
    except Exception:
        Logger().setLogger(time.log_path, 4, "Failed to close spider; db_session may have failed")
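The Pipeline class these run() scripts call is not shown in this listing; the sketch below is a minimal, hedged stub of the interface they assume (structure_set, open_spider, process_item, upload_item, close_spider). The method bodies and the db_session attribute are illustrative assumptions only, suggested by the log message above; the later run() examples for bbc and hq assume the same interface.

# Minimal stub of the Pipeline interface assumed by run() above.
# Method names come from the calls in the example; the bodies and the
# db_session attribute are illustrative assumptions only.
class Pipeline(object):

    def __init__(self, site_id, site_name):
        self.site_id = site_id
        self.site_name = site_name
        self.db_session = None          # assumed database session handle

    def structure_set(self):
        # Return whatever per-site structure the items are uploaded into.
        return {}

    def open_spider(self, sets):
        # Open the database session before items start flowing.
        self.db_session = object()      # placeholder for a real session

    def process_item(self, item):
        # Normalize / validate a scraped item.
        return item

    def upload_item(self, item, sets):
        # Persist the processed item.
        pass

    def close_spider(self):
        # Close the database session when the run is finished.
        self.db_session = None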
Example #3
def __init__(self, browser, settings, *a, **kw):
    self.browser = browser
    self.settings = settings
    if self.browser:
        self._init_from_db()
    self.internal_err = True
    self.deny = True
    self.clawed_urls = []
    self.site_urls = []
    self.currentUrl = ""
    self.started = False
    self.pipeline = Pipeline(self.settings.get('MYSERVER_URI'))
    self.proxyUrl = self.settings.get("SPLASH_URL")
    logger.info('spider init is finished!')
Example #4
def run():
    sets = Pipeline(bbc.site_id, bbc.site_name).structure_set()
    Pipeline(bbc.site_id, bbc.site_name).open_spider(sets)

    urls = BBC().first_requests()
    for item in BBC().second_requests(urls):
        Pipeline(bbc.site_id, bbc.site_name).process_item(item)
        Pipeline(bbc.site_id, bbc.site_name).upload_item(item, sets)

    try:
        Pipeline(bbc.site_id, bbc.site_name).close_spider()
    except Exception:
        Logger().setLogger(bbc.log_path, 4,
                           "Failed to close spider; db_session may have failed")
Example #5
    def __init__(s, dtype, stages, pipeq, bypassq):

        s.in_ = InValRdyBundle(dtype)
        s.out = OutValRdyBundle(dtype)

        s.in_q = InValRdyQueue(dtype, pipe=pipeq)
        s.out_q = OutValRdyQueue(dtype, bypass=bypassq)

        s.pipe = Pipeline(stages)
        s.connect(s.in_, s.in_q.in_)
        s.connect(s.out, s.out_q.out)

        @s.tick
        def logic():

            # Automatically enq from input / deq from output
            s.in_q.xtick()
            s.out_q.xtick()

            # No stall
            if not s.out_q.is_full():

                # Insert item into pipeline from input queue
                if not s.in_q.is_empty():
                    s.pipe.insert(s.in_q.deq())

                # Items graduating from pipeline, add to output queue
                if s.pipe.ready():
                    s.out_q.enq(s.pipe.remove())

                # Advance the pipeline
                s.pipe.advance()
Example #6
def run():
    sets = Pipeline(hq.site_id, hq.site_name).structure_set()
    Pipeline(hq.site_id, hq.site_name).open_spider(sets)

    detail_url = Huanqiu().first_requests()

    for item in Huanqiu().second_requests(detail_url):
        Huanqiu().process_item(item)

        Pipeline(hq.site_id, hq.site_name).process_item(item)
        Pipeline(hq.site_id, hq.site_name).upload_item(item, sets)

    try:
        Pipeline(hq.site_id, hq.site_name).close_spider()
    except Exception:
        Logger().setLogger(hq.log_path, 4, "Failed to close spider; db_session may have failed")
Example #7
  def __init__( s, out, nstages=1 ):

    s.nstages    = nstages

    # instantiate a single-entry bypass queue adapter
    s.out_q      = OutValRdyQueueAdapter( out )

    # instantiate a cycle-level pipeline
    if s.nstages > 0:
      s.pipe       = Pipeline( s.nstages )
Example #8
    def assess(model,
               df,
               columns,
               metrics,
               n_splits=5,
               early_stopping_rounds=20,
               verbose=0):
        """
        k-fold cross-validation
    
        Checkpoints saving strategy ...
        :param model: sklearn-like object
        :param df: DataFrame with X and y
        :param columns: column names splited by types like utils.split_columns_by_types
        :param metrics: sklearn.metrics like function
        :param n_splits: the number of folds
        :param early_stopping_rounds: LightGBM param
        :param verbose: 0 - no logs, 1 - info, 2 - debug
        :return: iterations log
        """
        if n_splits == 1:
            total_rows = df.shape[0]
            train_size = int(0.95 * total_rows)
            splits = [(df.index[:train_size], df.index[train_size:])]
        else:
            splits = kfold_with_respect_to_groups(df, n_splits=n_splits)
        log = []
        for train_index, valid_index in splits:
            print('\n---------------------------')
            with Timer('Data Preparation:', verbose):
                pipeline = Pipeline(**columns, verbose=verbose)
                x_train = pipeline.fit_transform(df.loc[train_index, :])
                y_train = df.loc[train_index, columns['target']]
                x_valid = pipeline.transform(df.loc[valid_index, :])
                y_valid = df.loc[valid_index, columns['target']]

            with Timer('Fitting:', verbose):
                model.fit(
                    x_train,
                    y_train,
                    eval_set=[(x_valid, y_valid)],
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=-1 if verbose != 2 else 1,
                )

            # with Timer('Postprocessing:', verbose):
            #     pred_train = scores_postprocessing(
            #         df=df.loc[train_index, :],
            #         predicted=model.predict(x_train),
            #         columns=columns,
            #         is_test=False,
            #     )[columns['target']]
            #     pred_valid = scores_postprocessing(
            #         df=df.loc[valid_index, :],
            #         predicted=model.predict(x_valid),
            #         columns=columns,
            #         is_test=False,
            #     )[columns['target']]
            pred_train = model.predict(x_train)
            pred_valid = model.predict(x_valid)

            with Timer('Saving:', verbose):
                train_score = metrics(y_train, pred_train)
                valid_score = metrics(y_valid, pred_valid)
                step = dict(
                    model=model,
                    pipeline=pipeline,
                    train_score=train_score,
                    valid_score=valid_score,
                    not_adj_train_score=metrics(y_train,
                                                model.predict(x_train)),
                    not_adj_valid_score=metrics(y_valid,
                                                model.predict(x_valid)),
                    train_index=train_index,
                    valid_index=valid_index,
                    path=None,
                    cached=False,
                )
                try:
                    step = save_model(step)
                except Exception:
                    if verbose == 1:
                        print("Warning: Couldn't save the model")
                log.append(step)
                gc.collect()

            if verbose == 1:
                print(step['train_score'], step['valid_score'])
            print('---------------------------\n')

        if verbose == 1:
            print('Erasing cache ...')
        for idx, step in enumerate(
                sorted(log, key=lambda dct: dct['valid_score'], reverse=True)):
            if idx == 0:
                step['best'] = True
                continue
            step['best'] = False

            try:
                os.remove(step['path'])
                if verbose == 2:
                    print('Removed:', step['path'])
            except Exception:
                if verbose == 2:
                    print("Warning: Couldn't remove file:", step['path'])
        return log
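A hedged usage sketch for assess follows; it assumes assess is importable as a plain function, a LightGBM regressor as the sklearn-like model, mean_absolute_error as the metric, and the utils.split_columns_by_types helper mentioned in the docstring. None of these names besides assess come from the example itself.

# Hedged usage sketch only: the model, metric, data file, and utils helper
# are assumptions, not part of the original example.
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_absolute_error

import utils  # assumed project module providing split_columns_by_types

df = pd.read_csv("train.csv")                 # hypothetical training data
columns = utils.split_columns_by_types(df)    # expected to contain a 'target' key
model = lgb.LGBMRegressor(n_estimators=1000)

log = assess(model, df, columns, metrics=mean_absolute_error, n_splits=5, verbose=1)
best = next(step for step in log if step.get('best'))
print(best['valid_score'])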
Example #9
    logging.info(f"Command executed: {' '.join(sys.argv)}")
    logging.info("Starting outside variant pipeline analysis")
    file1 = args.case_gen
    file2 = args.control_gen
    pairing = args.SNP_pairs
    init_file = args.init_file
    p_file = args.output_folder
    override_folder = args.override

    odds_file = ""

    logging.info("Initializing pipeline. This might take a few seconds.")
    args.exec_dir = os.getcwd()
    with cd(args.input_folder_path):
        pipe = Pipeline.init_from_file(
            init_file, file1, file2, pairing, p_file, odds_file, args)
    logging.info("Making output directory")
    working_dir = make_working_dir(p_file, override_folder)
    pipe.working_dir = working_dir
    pipe.p_value_filename = p_file.split("/")[-1]
    pipe.hash = make_hash(args.input_folder_path, init_file,
                          file1, file2, pairing, args.unique_identifier)

    with cd(args.input_folder_path):
        pipe.read_input_files()
    logging.info("Running pipeline...")
    with cd(pipe.working_dir):
        pipe.run()
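The snippet changes directories through a cd context manager that is not shown here; a minimal sketch of such a helper, assuming it simply wraps os.chdir and restores the previous working directory, could look like this:

# Minimal sketch of the cd helper assumed above: enter a directory for the
# duration of the with-block and always return to the previous one.
import os
from contextlib import contextmanager

@contextmanager
def cd(path):
    prev = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(prev)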
Example #10
class BaseSpider(object):
    """The most top class."""
    start_urls = []
    start_host = "http://www.bjpc.gov.cn/"
    filter_urls = []
    init_db = False
    depart = ""

    def __init__(self, browser, settings, *a, **kw):
        self.browser = browser
        self.settings = settings
        if self.browser:
            self._init_from_db()
        self.internal_err = True
        self.deny = True
        self.clawed_urls = []
        self.site_urls = []
        self.currentUrl = ""
        self.started = False
        self.pipeline = Pipeline(self.settings.get('MYSERVER_URI'))
        self.proxyUrl = self.settings.get("SPLASH_URL")
        logger.info('spider init is finished!')

    def _init_from_db(self):
        self.client = pymongo.MongoClient(self.settings.get('MONGO_URI'))
        self.db = self.client[self.settings.get('MONGO_DATABASE', 'test')]

    def start(self, site, startUrl=None):
        res = self.db.GovDepartment.find_one({"key": site})
        self._init_gov_data(res)
        logger.info('start crawl %s!' % startUrl)
        #self.start_requests()
        self.request(startUrl)
        self.destroy_init_data()

    def _is_filter_url(self, url):
        if not url.startswith(self.start_host):
            return True  # only crawl within this site
        if self.deny:
            for u in self.filter_urls:
                if url.startswith(u):
                    return True
            return False
        else:
            for u in self.filter_urls:
                if url.startswith(u):
                    return False
            return True

    def destroy_init_data(self):
        self.condition = ""
        self.start_urls = []
        del self.fields
        self.init_db = False
        del self.link_extractor
        self.f.close()
        #self.start_host = res["link"]

    def _init_gov_data(self, gov):
        try:
            self.condition = gov["condition"]
            self.start_urls.append(gov["link"])
            #self.browser = webdriver.Firefox()
            self.start_host = gov["link"]
            self.f = open(gov["key"] + ".txt", 'a+')

            self.fields = {}
            self.init_db = True
            self.internal_err = False
            self.link_extract()
            for field in gov["fields"]:
                self.fields[field["name"]] = field["xpath"]
        except Exception as e:
            print "_init_gov_data error: %s" % e.message

    def link_extract(self):
        self.link_extractor = None

    def start_requests(self):
        for url in self.start_urls:
            self.request(url)

    def close(self):
        logger.warning('mongo db client closed')
        self.client.close()

    def getResponse(self, url, browser):
        res = TextResponse(url, body=browser.page_source.encode("utf-8"))
        return res

    def _getPage(self, url, browser):
        newUrl = "%s%s%s%s" % (self.proxyUrl, "?url=", url,
                               "&timeout=10&wait=0.5")
        print "get newUrl %s" % newUrl
        self.currentUrl = url
        browser.get(newUrl)

    def request(self, url):
        pass

    def get_item(self, response):
        il = PageItemLoader(item=PageContentItem(depart=self.depart),
                            response=response)
        il.add_value('link', response.url)
        for (k, v) in self.fields.items():
            il.add_xpath(k, v)  # only simple XPath selectors are currently supported

        return il.load_item()

    def save_item(self, item):
        self.pipeline.send_item(item)

    def satisfy_craw(self, response):
        for condition in self.condition:
            data = response.xpath(condition).extract()
            if data != []:
                return True
        return False

    def crawedAppend(self, url):
        if not self.hasCrawedUrl(url):
            self.clawed_urls.append(url)
            self.f.write(url.encode("utf-8") + "\n")

    def hasCrawedUrl(self, url):
        return url in self.clawed_urls

    def add_urls_noduplicate(self, site_urls):
        res = []
        if isinstance(site_urls, list):
            for link in site_urls:
                self._add_url_nodup(link.url, res)
        else:
            self._add_url_nodup(site_urls, res)
        return res

    def _add_url_nodup(self, url, dstArr):
        if url not in dstArr:
            dstArr.append(url)
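As a hedged illustration (not part of the original source), a concrete spider would typically subclass BaseSpider and implement request(); the ExampleSpider name and its depart value below are hypothetical, while every helper it calls is defined in the class above.

# Hypothetical subclass sketch: ExampleSpider does not appear in the original
# source; it only combines helpers already defined by BaseSpider.
class ExampleSpider(BaseSpider):
    depart = "example-department"  # hypothetical department tag

    def request(self, url):
        # Skip filtered or already-crawled urls.
        if self._is_filter_url(url) or self.hasCrawedUrl(url):
            return
        # Fetch through the Splash proxy and wrap the page as a response.
        self._getPage(url, self.browser)
        response = self.getResponse(url, self.browser)
        # Extract and persist an item only when the crawl conditions match.
        if self.satisfy_craw(response):
            self.save_item(self.get_item(response))
        self.crawedAppend(url)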
Example #11
def test_Pipeline(dump_vcd, stages):

    # Create the pipeline
    pipeline = Pipeline(stages)
    pipeline.vcd_file = dump_vcd

    # Fill up the pipeline
    i = -1
    for i in range(stages - 1):
        pipeline.advance()
        pipeline.insert(i)
        assert not pipeline.ready()

    # Insert one last item
    pipeline.advance()
    pipeline.insert(i + 1)

    # Make sure there is something at the tail of the pipeline
    assert pipeline.ready()

    # Start removing items from the pipeline
    for i in range(stages):
        assert pipeline.ready()
        assert pipeline.remove() == i
        pipeline.advance()

    assert not pipeline.ready()
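The Pipeline class exercised by this test (and by the PyMTL-style examples earlier in the listing) is not shown; the following is a minimal sketch consistent with the insert/advance/ready/remove contract used above, built on collections.deque. It is an assumption for illustration, not the actual implementation.

# Minimal sketch of the cycle-level Pipeline assumed above, supporting only
# the insert/advance/ready/remove contract exercised by test_Pipeline.
from collections import deque

class Pipeline(object):

    def __init__(self, stages=1):
        assert stages > 0
        # One slot per stage; None marks an empty slot.
        self.data = deque([None] * stages, maxlen=stages)

    def insert(self, item):
        # Place a new item in the first stage.
        self.data[0] = item

    def ready(self):
        # An item is ready when the last stage is occupied.
        return self.data[-1] is not None

    def remove(self):
        # Take the item out of the last stage.
        item = self.data[-1]
        self.data[-1] = None
        return item

    def advance(self):
        # Shift every item one stage forward; the first stage becomes empty.
        # Assumes the last stage has been drained first, as in the test above.
        self.data.appendleft(None)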