def get_results(pages=300):
    """Collect tutorial search results for every tag in ``TAGS``.

    For each tag, the trigrams that contain it (in any of the three
    positions) are visited from highest to lowest ``count``.  Each trigram
    becomes the search query ``"<tag1> <tag2> <tag3> tutorial"``; every hit
    returned by ``fetch_results`` is passed to ``save_page`` together with
    its 1-based rank, and hits whose ``"link"`` has not been seen yet for
    this tag are accumulated.  Once at least ``pages`` unique hits are held,
    no further trigrams are queried.  The accumulated list is then handed to
    ``write_results_file`` with a path built from ``search_results`` and
    ``<tag>-results.json``.

    Args:
        pages: Number of unique results to gather per tag before the
            trigram loop stops.  A whole batch of hits is added before the
            limit is checked, so the list written out may hold more than
            ``pages`` entries (or fewer, if the trigrams run out first).
    """
    for tag in TAGS:
        contains_tag = (
            (Trigram.tag1 == tag) | (Trigram.tag2 == tag) | (Trigram.tag3 == tag)
        )
        ranked_trigrams = Trigram.select().where(contains_tag).order_by(
            Trigram.count.desc()
        )

        collected = []
        seen_links = set()
        for trigram in ranked_trigrams:
            query = " ".join(
                [trigram.tag1, trigram.tag2, trigram.tag3, "tutorial"]
            )
            hits = fetch_results(query)

            # First pass: persist every hit together with its 1-based rank.
            for position, hit in enumerate(hits, start=1):
                save_page(tag, query, hit["link"], position, hit["title"])

            # Second pass: keep only hits whose link is new for this tag.
            for hit in hits:
                link = hit["link"]
                if link in seen_links:
                    continue
                seen_links.add(link)
                collected.append(hit)

            # Checked once per trigram, after the whole batch was merged.
            if len(collected) >= pages:
                break

        output_path = os.path.join("search_results", tag + "-results.json")
        write_results_file(output_path, collected)
tg = Trigram.create(tag1=tags_ord[0], tag2=tags_ord[1], tag3=tags_ord[2]) except peewee.IntegrityError: tg = Trigram.get( Trigram.tag1 == tags_ord[0], Trigram.tag2 == tags_ord[1], Trigram.tag3 == tags_ord[2], ) tg.count = i['count'] tg.save() if __name__ == '__main__': parser = argparse.ArgumentParser(description="Download n-grams of StackOverflow tags") parser.add_argument('--init', action='store_true', help='create database tables') parser.add_argument('--tri', help='print top 3-grams for a tag') args = parser.parse_args() if args.tri: tgs = Trigram.select().where( (Trigram.tag1 == args.tri) | (Trigram.tag2 == args.tri) | (Trigram.tag3 == args.tri) ).order_by(Trigram.count.desc()).limit(10) for tg in tgs: print ' '.join([tg.tag1, tg.tag2, tg.tag3]) else: if args.init: create_tables() fetch_bigrams() fetch_trigrams()