Example #1
0
def main():
    """Parse and index the result URLs stored for each search query.

    Reads every document from the ``urls`` collection of the
    ``google-2015-11-03`` database. For each of the first 100 entries of a
    document's ``sites`` list that starts with ``http`` and is not already
    present in the ``index`` collection (matched on the ``url`` field), the
    page is run through the ``Parser`` pipeline, the parser result is
    annotated with the query text and its position, inserted into ``index``,
    and handed to ``TextAnalyzer`` with the query's words as keys.
    """
    client = MongoClient(settings.DATABASE)
    db_name = 'google-2015-11-03'
    db = client[db_name]

    for query in db.urls.find():
        print(query['query'])
        # Slice rather than index 0..99: a stored page with fewer than 100
        # links used to raise IndexError here.
        for i, site in enumerate(query['sites'][:100]):
            if not site.startswith('http'):
                continue
            print(site)
            # Skip anything already indexed. An existence check also covers
            # the case where several copies are stored, which the previous
            # `count() == 1` comparison treated as "not indexed".
            if db.index.find_one({u'url': site}) is not None:
                continue
            p = Parser()
            p.url = site
            p.open_url()
            p.get_links()
            p.get_elements()
            p.set_elements()
            p.result['query'] = query['query']
            # Position is the offset within this page plus 100 per
            # preceding search page.
            p.result['position'] = i + (query['search_page'] * 100)
            db.index.insert_one(p.result)
            t = TextAnalyzer(p.result)
            t.keys = query['query'].split()
            t.analyze()
Example #2
0
def main():
    """Scrape three Google result pages for every query and save each page.

    Creates a ``Parser`` for ``http://google.com.ua/``, points the class-level
    ``Parser.db`` at a database named ``google-<today's ISO date>``, and
    configures the ``sites`` XPath that selects the ``href`` of links inside
    ``h3`` elements with class ``r``.

    For each entry in ``queries`` and each start offset 0, 100 and 200, the
    URL is built from the ``google`` prefix, the URL-quoted cp1251 encoding
    of ``qa(query)`` and a ``&start=N`` suffix. The page is opened and
    extracted, the result is tagged with the query, the normalised ``sites``
    value and the page number, then ``save()`` and ``clean()`` are called and
    a progress line is printed.
    """
    scraper = Parser('http://google.com.ua/', False)
    database_name = 'google-' + date.today().isoformat()
    Parser.db = Parser.client[database_name]
    scraper.regulars = {u'sites': u'//h3[@class="r"]/a/@href'}

    for number, text in enumerate(queries):
        for page in range(3):
            # Each page is requested with an offset of 100 results.
            offset = '&start=' + str(page * 100)
            scraper.url = (
                google + urllib.quote(qa(text).encode('cp1251')) + offset
            )

            # Fetch the page and extract the configured elements.
            scraper.open_url()
            scraper.get_elements()
            scraper.set_elements()

            # Annotate the extracted result before it is saved.
            scraper.result[u'query'] = text
            scraper.result[u'sites'] = norm(scraper.result[u'sites'])
            scraper.result[u'search_page'] = page

            scraper.save()
            scraper.clean()
            print(u'Запрос: [{}] "{}" отсканирован и сохранен'.format(number, text))