def fetch_trigrams():
    """Fetch tag trigrams from the StackOverflow related-tags API.

    For every tag in TARGET_TAGS, look up its stored bigrams, query the
    API for tags related to each pair, and upsert one Trigram row per
    result with the tags stored in sorted order (so each unordered
    triple maps to exactly one row) and the reported co-occurrence count.
    """
    for tag in TARGET_TAGS:
        # Materialize the query results into a list before creating rows:
        # iterating the peewee select directly while writing to the table
        # raised an exception (original author's note preserved).
        tag_pairs = [
            (bg.tag1, bg.tag2)
            for bg in Bigram.select().where(Bigram.tag1 == tag)
        ]
        for tag1, tag2 in tag_pairs:
            resp = session.get(make_url([tag1, tag2]), params={
                'pagesize': 100,
                'site': 'stackoverflow',
            })
            resp_json = resp.json()
            # Some responses carry no 'items' key (e.g. error payloads);
            # skip them rather than KeyError below.
            if 'items' not in resp_json:
                continue
            for item in resp_json['items']:
                # Sort so (a, b, c) and (c, a, b) hit the same row.
                tags_ord = sorted([tag1, tag2, item['name']])
                try:
                    tg = Trigram.create(tag1=tags_ord[0], tag2=tags_ord[1],
                                        tag3=tags_ord[2])
                except peewee.IntegrityError:
                    # Row already exists: fetch it so the count can be
                    # refreshed below.
                    tg = Trigram.get(
                        Trigram.tag1 == tags_ord[0],
                        Trigram.tag2 == tags_ord[1],
                        Trigram.tag3 == tags_ord[2],
                    )
                tg.count = item['count']
                tg.save()
def get_results(pages=300):
    """Collect tutorial search results for every tag in TAGS.

    For each tag, its trigrams are walked in descending count order; a
    "<t1> <t2> <t3> tutorial" query is run for each one, every ranked hit
    is persisted via save_page, and de-duplicated hits (by link) are
    accumulated until at least `pages` have been gathered.  The collected
    hits are then written to search_results/<tag>-results.json.
    """
    for tag in TAGS:
        collected = []
        seen_links = set()
        trigrams = (
            Trigram.select()
            .where((Trigram.tag1 == tag)
                   | (Trigram.tag2 == tag)
                   | (Trigram.tag3 == tag))
            .order_by(Trigram.count.desc())
        )
        for trigram in trigrams:
            query = "%s %s %s tutorial" % (trigram.tag1,
                                           trigram.tag2,
                                           trigram.tag3)
            hits = fetch_results(query)
            # Persist every hit with its 1-based rank for this query.
            for rank, hit in enumerate(hits, 1):
                save_page(tag, query, hit["link"], rank, hit["title"])
            # Accumulate hits not seen under an earlier trigram.
            for hit in hits:
                link = hit["link"]
                if link in seen_links:
                    continue
                seen_links.add(link)
                collected.append(hit)
            if len(collected) >= pages:
                break
        write_results_file(
            os.path.join("search_results", tag + "-results.json"),
            collected)
# convert variables to the right types xloc = float(xloc) yloc = float(yloc) loc = (xloc, yloc) parse = ParentedTree.parse(parse) modparse = ParentedTree.parse(modparse) # how many ancestors should the sampled landmark have? num_ancestors = count_lmk_phrases(modparse) - 1 # sample `args.iterations` times for each sentence for _ in xrange(args.iterations): lmk, rel = get_meaning(loc, num_ancestors) if args.verbose: print "utterance:", repr(sentence) print "location: %s" % repr(loc) print "landmark: %s (%s)" % (lmk, lmk_id(lmk)) print "relation: %s" % rel_type(rel) print "parse:" print parse.pprint() print "modparse:" print modparse.pprint() print "-" * 70 location = Location(x=xloc, y=yloc) save_tree(modparse, location, rel, lmk) Bigram.make_bigrams(location.words) Trigram.make_trigrams(location.words) session.commit()
# --- Fragment: the opening `try:` (and the loops enclosing this upsert) lie
# --- before this chunk; the code below duplicates the tail of
# --- fetch_trigrams. Indentation reconstructed from a collapsed line.
                tg = Trigram.create(tag1=tags_ord[0], tag2=tags_ord[1],
                                    tag3=tags_ord[2])
            except peewee.IntegrityError:
                # Trigram row already exists; fetch it so its count can be
                # refreshed below.
                tg = Trigram.get(
                    Trigram.tag1 == tags_ord[0],
                    Trigram.tag2 == tags_ord[1],
                    Trigram.tag3 == tags_ord[2],
                )
            tg.count = i['count']
            tg.save()


if __name__ == '__main__':
    # Command-line entry point: either report stored trigrams (--tri)
    # or fetch n-grams from the API (optionally creating tables first).
    parser = argparse.ArgumentParser(description="Download n-grams of StackOverflow tags")
    parser.add_argument('--init', action='store_true', help='create database tables')
    parser.add_argument('--tri', help='print top 3-grams for a tag')
    args = parser.parse_args()

    if args.tri:
        # Report mode: the 10 most frequent trigrams containing the tag,
        # matched against any of the three tag columns.
        tgs = Trigram.select().where(
            (Trigram.tag1 == args.tri)
            | (Trigram.tag2 == args.tri)
            | (Trigram.tag3 == args.tri)
        ).order_by(Trigram.count.desc()).limit(10)
        for tg in tgs:
            print ' '.join([tg.tag1, tg.tag2, tg.tag3])
    else:
        # Fetch mode: optionally create tables, then download bigrams
        # before trigrams (trigrams are built from stored bigrams).
        if args.init:
            create_tables()
        fetch_bigrams()
        fetch_trigrams()
# --- Fragment: continues a per-sentence training loop from earlier in the
# --- original file; `i`, `sentence`, `loc`, `lmk`, `rel`, `xloc`, `yloc`,
# --- `args` and `unique_sentences` are bound before this chunk (Python 2).
# --- Indentation reconstructed from a collapsed line — confirm nesting.
    if args.verbose:
        print 'utterance:', repr(sentence)
        print 'location: %s' % repr(loc)
        print 'landmark: %s (%s)' % (lmk, lmk_id(lmk))
        print 'relation: %s' % rel_type(rel)
        print 'parse:'
        print parse.pprint()
        print 'modparse:'
        print modparse.pprint()
        print '-' * 70

    # persist the sampled meaning and derive word n-grams from it
    location = Location(x=xloc, y=yloc)
    save_tree(modparse, location, rel, lmk)
    Bigram.make_bigrams(location.words)
    Trigram.make_trigrams(location.words)
    # commit in batches of 200 iterations to bound transaction size
    if i % 200 == 0:
        session.commit()

# store a "blind" parse record for every unique sentence seen above
for sentence, (parse, modparse) in unique_sentences.items():
    SentenceParse.add_sentence_parse_blind(sentence, parse, modparse)
session.commit()

print 'counting ...'

# count words: alias Word twice (a word and its neighbor) plus a parent
# Production so the query can join word pairs to their parent production
w1 = aliased(Word)
w2 = aliased(Word)
parent = aliased(Production)
# NOTE(review): this statement is truncated in this chunk; the query
# continues past the end of the visible source.
qry = session.query(w1.word, w2.word, w2.pos,