Example #1
0
def fetch_trigrams():
    """Download and store tag trigrams derived from the stored bigrams.

    For every tag in ``TARGET_TAGS``, each ``Bigram`` row whose ``tag1``
    equals that tag is expanded: the URL built by ``make_url`` for the tag
    pair is requested through ``session``, and every entry of the response's
    ``'items'`` list is stored as a ``Trigram`` made of the two bigram tags
    plus the entry's ``'name'``, in sorted order. The trigram's ``count`` is
    set from the entry's ``'count'`` and saved.
    """
    for tag in TARGET_TAGS:

        # Copy the pairs into a list up front instead of iterating over the
        # query directly while making requests; the latter was observed to
        # throw an exception.
        tag_pairs = [
            (bg.tag1, bg.tag2)
            for bg in Bigram.select().where(Bigram.tag1 == tag)
        ]

        for first, second in tag_pairs:
            resp = session.get(make_url([first, second]), params={
                'pagesize': 100,
                'site': 'stackoverflow',
            })
            payload = resp.json()
            # A response without an 'items' key has nothing to store.
            if 'items' not in payload:
                continue

            for item in payload['items']:
                # Sort the three tags so a given combination always maps to
                # the same (tag1, tag2, tag3) column values.
                tag1, tag2, tag3 = sorted([first, second, item['name']])
                try:
                    tg = Trigram.create(tag1=tag1, tag2=tag2, tag3=tag3)
                except peewee.IntegrityError:
                    # Presumably the row already exists (uniqueness
                    # constraint on the tag triple -- TODO confirm against
                    # the model); load it so its count can be updated.
                    tg = Trigram.get(
                        Trigram.tag1 == tag1,
                        Trigram.tag2 == tag2,
                        Trigram.tag3 == tag3,
                    )
                tg.count = item['count']
                tg.save()
Example #2
0
def get_results(pages=300):
    """Gather search results per tag and write them to a results file.

    For each tag in ``TAGS``, the trigrams containing that tag are visited in
    descending ``count`` order. Each trigram becomes the query
    ``"<tag1> <tag2> <tag3> tutorial"``, which is passed to
    ``fetch_results``. Every hit is recorded through ``save_page`` with its
    1-based rank, and hits whose ``"link"`` has not been seen yet for this
    tag are accumulated. The accumulated list is written with
    ``write_results_file`` to ``search_results/<tag>-results.json``.

    Args:
        pages: Stop querying further trigrams for a tag once at least this
            many unique results have been collected. The limit is checked
            after each query, so the written list may contain more entries
            than ``pages``.
    """
    for ml in TAGS:

        ml_results = []
        ml_links = set()

        trigrams = (
            Trigram.select()
            .where((Trigram.tag1 == ml) | (Trigram.tag2 == ml) | (Trigram.tag3 == ml))
            .order_by(Trigram.count.desc())
        )

        for tg in trigrams:

            query = " ".join([tg.tag1, tg.tag2, tg.tag3]) + " tutorial"

            # Record and de-duplicate in one pass so the hits are consumed
            # only once; this also works if fetch_results hands back a
            # one-shot iterator rather than a list.
            for rank, r in enumerate(fetch_results(query), 1):
                link = r["link"]
                save_page(ml, query, link, rank, r["title"])
                if link not in ml_links:
                    ml_links.add(link)
                    ml_results.append(r)

            if len(ml_results) >= pages:
                break

        write_results_file(os.path.join("search_results", ml + "-results.json"), ml_results)
Example #3
0
        # convert variables to the right types
        xloc = float(xloc)
        yloc = float(yloc)
        loc = (xloc, yloc)
        parse = ParentedTree.parse(parse)
        modparse = ParentedTree.parse(modparse)

        # how many ancestors should the sampled landmark have?
        num_ancestors = count_lmk_phrases(modparse) - 1

        # sample `args.iterations` times for each sentence
        for _ in xrange(args.iterations):
            lmk, rel = get_meaning(loc, num_ancestors)

            if args.verbose:
                print "utterance:", repr(sentence)
                print "location: %s" % repr(loc)
                print "landmark: %s (%s)" % (lmk, lmk_id(lmk))
                print "relation: %s" % rel_type(rel)
                print "parse:"
                print parse.pprint()
                print "modparse:"
                print modparse.pprint()
                print "-" * 70

            location = Location(x=xloc, y=yloc)
            save_tree(modparse, location, rel, lmk)
            Bigram.make_bigrams(location.words)
            Trigram.make_trigrams(location.words)
            session.commit()
Example #4
0
                    tg = Trigram.create(tag1=tags_ord[0], tag2=tags_ord[1], tag3=tags_ord[2])
                except peewee.IntegrityError:
                    tg = Trigram.get(
                        Trigram.tag1 == tags_ord[0],
                        Trigram.tag2 == tags_ord[1],
                        Trigram.tag3 == tags_ord[2],
                    )
                tg.count = i['count']
                tg.save()


if __name__ == '__main__':
    # Command-line entry point.
    #   --tri TAG : print the ten highest-count trigrams containing TAG.
    #   --init    : create the database tables before downloading.
    # Without --tri, bigrams and then trigrams are downloaded.
    parser = argparse.ArgumentParser(description="Download n-grams of StackOverflow tags")
    parser.add_argument('--init', action='store_true', help='create database tables')
    parser.add_argument('--tri', help='print top 3-grams for a tag')
    args = parser.parse_args()

    if args.tri:
        # The tag may appear in any of the three positions of a trigram.
        tgs = Trigram.select().where(
            (Trigram.tag1 == args.tri) |
            (Trigram.tag2 == args.tri) |
            (Trigram.tag3 == args.tri)
        ).order_by(Trigram.count.desc()).limit(10)
        for tg in tgs:
            # Parenthesized single-argument form prints the same text whether
            # ``print`` is a statement (Python 2) or a function (Python 3).
            print(' '.join([tg.tag1, tg.tag2, tg.tag3]))
    else:
        if args.init:
            create_tables()
        fetch_bigrams()
        fetch_trigrams()
Example #5
0
            if args.verbose:
                print 'utterance:', repr(sentence)
                print 'location: %s' % repr(loc)
                print 'landmark: %s (%s)' % (lmk, lmk_id(lmk))
                print 'relation: %s' % rel_type(rel)
                print 'parse:'
                print parse.pprint()
                print 'modparse:'
                print modparse.pprint()
                print '-' * 70

            location = Location(x=xloc, y=yloc)
            save_tree(modparse, location, rel, lmk)
            Bigram.make_bigrams(location.words)
            Trigram.make_trigrams(location.words)

        if i % 200 == 0: session.commit()

    for sentence,(parse,modparse) in unique_sentences.items():
        SentenceParse.add_sentence_parse_blind(sentence, parse, modparse)

    session.commit()

    print 'counting ...'

    # count words
    w1 = aliased(Word)
    w2 = aliased(Word)
    parent = aliased(Production)
    qry = session.query(w1.word, w2.word, w2.pos,