def get_initial_requests(self):
    """Yield one initial ``Request`` per distinct seed.

    Seeds are collected from ``self.seeds`` and from every file named in
    ``self.seed_files`` (one seed per line, surrounding whitespace
    stripped, blank lines ignored).  Seeds are de-duplicated on their raw
    text before normalisation, so the order in which requests are yielded
    is unspecified.

    Each request carries the value of ``self.get_initial_state(url)``
    under the ``'lcrawl.labels'`` meta key, where ``url`` is the first
    element returned by ``norm_url`` for the seed.
    """
    seeds = set(self.seeds)
    for fname in self.seed_files:
        with open(fname, 'r') as f:
            stripped = (line.strip() for line in f)
            # Skip blank lines (e.g. a trailing newline at end of file) so
            # an empty string is never treated as a seed.
            seeds.update(seed for seed in stripped if seed)
    for seed in seeds:
        # norm_url returns a pair; only its first element is needed here.
        url, _ = norm_url(seed)
        yield Request(url, meta={'lcrawl.labels': self.get_initial_state(url)})
def load_from_txt(cls, filename):
    """Build an instance of ``cls`` from a text file of labelled URLs.

    Each non-blank line must hold exactly two whitespace-separated
    fields: a comma-separated list of labels followed by a URL.  Blank
    lines are ignored.

    Returns ``cls(entries, filename)`` where ``entries`` is a list of
    ``(labels, url)`` tuples in file order; ``labels`` is the list
    produced by splitting the first field on ``','`` and ``url`` is the
    first element returned by ``norm_url`` for the second field.

    Raises ``ValueError`` if a non-blank line does not contain exactly
    two fields.
    """
    entries = []
    with open(filename, 'r') as f:
        for lineno, raw in enumerate(f, 1):
            line = raw.strip()
            if not line:
                # Tolerate empty lines, e.g. a trailing newline at EOF.
                continue
            # Split on any run of whitespace so tabs or repeated spaces
            # between the two fields do not break parsing.
            fields = line.split()
            if len(fields) != 2:
                raise ValueError(
                    '%s:%d: expected "<labels> <url>", got %r'
                    % (filename, lineno, line))
            labels, url = fields
            entries.append((labels.split(','), norm_url(url)[0]))
    return cls(entries, filename)