def main():
    """Crawl the stored search-result URLs and index each fetched page.

    Iterates over every document in the ``urls`` collection of the
    ``google-2015-11-03`` database.  For each of the (at most 100) entries
    in the document's ``sites`` list that starts with ``http`` and is not
    yet present in the ``index`` collection, a ``Parser`` is run over the
    URL, its ``result`` is tagged with the originating query and the
    absolute search position, stored in ``index`` and passed to
    ``TextAnalyzer``.
    """
    client = MongoClient(settings.DATABASE)
    db_name = 'google-2015-11-03'
    db = client[db_name]

    for query in db.urls.find():
        print(query['query'])
        # Enumerate a slice rather than indexing range(100): a stored result
        # page may contain fewer than 100 links, which previously raised
        # IndexError part-way through the run.
        for i, url in enumerate(query['sites'][:100]):
            # NOTE(review): the whole crawl step is assumed to be gated on
            # this http check (a non-http entry cannot be opened) -- confirm.
            if not url.startswith('http'):
                continue
            print(url)

            # Skip URLs that are already indexed.  find_one() also covers
            # the case where duplicates exist, which ``count() == 1`` missed
            # and therefore re-crawled on every run.
            if db.index.find_one({u'url': url}) is not None:
                continue

            p = Parser()
            p.url = url
            p.open_url()
            p.get_links()
            p.get_elements()
            p.set_elements()
            p.result['query'] = query['query']
            # Position across result pages: offset within this page plus
            # 100 for every preceding search page.
            p.result['position'] = i + (query['search_page'] * 100)
            db.index.insert_one(p.result)

            t = TextAnalyzer(p.result)
            t.keys = query['query'].split()
            t.analyze()
def main():
    """Scan Google result pages for every query and save the found links.

    Uses a database named ``google-<today's ISO date>`` (assigned to
    ``Parser.db``).  For each entry of ``queries`` three result pages are
    requested (``start`` offsets 0, 100 and 200); the links matched by the
    ``sites`` XPath are normalised with ``norm`` and saved together with
    the query text and the page number.  A progress line is printed once a
    query has been processed.
    """
    pages_per_query = 3
    results_per_page = 100

    parser = Parser('http://google.com.ua/', False)
    Parser.db = Parser.client['google-' + date.today().isoformat()]
    parser.regulars = {u'sites': u'//h3[@class="r"]/a/@href'}

    for index, query_text in enumerate(queries):
        # The encoded query does not depend on the page, so build it once.
        encoded_query = urllib.quote(qa(query_text).encode('cp1251'))

        for page in range(pages_per_query):
            offset = '&start=' + str(page * results_per_page)
            parser.url = google + encoded_query + offset

            parser.open_url()
            parser.get_elements()
            parser.set_elements()

            record = parser.result
            record[u'query'] = query_text
            record[u'sites'] = norm(record[u'sites'])
            record[u'search_page'] = page

            parser.save()
            parser.clean()

        print(u'Запрос: [{}] "{}" отсканирован и сохранен'.format(index, query_text))