Example 1
    def loadTerms(self):
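        """Load term, airport and hotel JSON data and bucket terms into restaurant and spot lists."""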
        termFile = os.path.join(srcDir, "nlp", "new_term.json")
        terms = web_util.load_json(termFile)
        airportFile = os.path.join(srcDir, "nlp", "airport.json")
        airport = web_util.load_json(airportFile)
        hotelFile = os.path.join(srcDir, "nlp", "hotel.json")
        hotels = web_util.load_json(hotelFile)

        self.restaurantList = []
        self.spotList = []
        self.hotelList = []

        for air in airport:
            # note: each iteration overwrites self.airport, so only the last entry is kept
            self.airport = View(air, airport[air])

        for hotel in hotels:
            view = View(hotel, hotels[hotel])
            self.hotelList.append(view)

        for term in terms:
            if terms[term]["popularity"] == 0:  # skip terms with zero popularity
                continue

            view = View(term, terms[term])
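            # topics whose code starts with "0" are treated as restaurants, "1" as sight-seeing spots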
            for topic in view.topicList:
                if topic[0] == "0":
                    self.restaurantList.append(view)
                elif topic[0] == "1":
                    self.spotList.append(view)
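View, web_util and srcDir come from elsewhere in this project. A minimal sketch of what View presumably wraps, based only on how it is used above (attribute and key names other than topicList are guesses):

class View:
    # hypothetical sketch, not the project's actual class
    def __init__(self, name, info):
        self.name = name                        # term / airport / hotel name
        self.info = info                        # the raw JSON record
        self.topicList = info.get('topic', [])  # assumed key; see the term.json layout in the last example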
Example 2
def main():
    data = load_json('new_term.json')
    new_data = {}
    # keep only terms that were actually mentioned and that have a resolved coordinate
    for k, v in data.items():
        if v['popularity'] != 0 and v['coord'][0] != 0:
            new_data[k] = v
    write_json(new_data, 'new_term.json')
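load_json and write_json are small helpers shared by these scripts. A plausible sketch, assuming plain UTF-8 JSON files (ensure_ascii=False is only an assumption, so the Chinese text stays readable on disk):

import json

def load_json(path):
    # assumed helper: read a UTF-8 JSON file into a dict
    with open(path, encoding='utf-8') as f:
        return json.load(f)

def write_json(data, path):
    # assumed helper: write the dict back out as UTF-8 JSON
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)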
Example 3
def infer(collection, term):
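    """Summarise the topics of board articles that mention the term and resolve its coordinates."""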
    print('Query term:', term)

    result_list = search_db(collection, term)
    popularity = len(result_list)
    if popularity == 0:
        # return empty results in the same shape as the normal return below
        return [], popularity, '', {}, [], []
    print('Found articles (popularity): ' + str(popularity))

    atopic = load_json('article_topic.json')
    topic_count = {}
    topic_article = {}
    total = 0
    for s in result_list:
        a_id = s['article_id']
        a_date = s['date']
        if a_id in atopic:
            for t in atopic[a_id]:
                total += 1
                topic_count[t] = topic_count.get(t, 0) + 1
                if t in topic_article:
                    # record each article at most once per topic
                    if a_id not in [ele[0] for ele in topic_article[t]]:
                        topic_article[t].append(
                            (a_id, len(atopic[a_id]), a_date))
                else:
                    topic_article[t] = [(a_id, len(atopic[a_id]), a_date)]

    # within each topic, order articles by how focused they are (fewer topics first),
    # then by most recent date
    for k, v in topic_article.items():
        topic_article[k] = sorted(v, key=lambda x: (x[1], -(x[2].toordinal())))

    # keep only the ordered article ids per topic
    article = {}
    for k, v in topic_article.items():
        article[k] = [ele[0] for ele in v]

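    # (topic, hit count, share of all topic hits), sorted by count descending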
    topic_list = []
    for k, v in topic_count.items():
        topic_list.append((k, v, round(v / float(total), 4)))
    topic_list = sorted(topic_list, key=lambda x: x[1], reverse=True)

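    # take reference articles from the most frequent topics, keeping at most three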
    ref = []
    for _t in topic_list:
        ref += article[_t[0]]
        if len(ref) > 2:
            break
    ref = ref[:3]

    coord = translate_location(term)

    url = 'https://www.ptt.cc/bbs/Japan_Travel/'
    return topic_list, popularity, url, article, ref, coord
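A hedged usage sketch for infer; the MongoDB connection mirrors the later examples, and search_db plus translate_location are assumed to live in the same module:

from pymongo import MongoClient

if __name__ == '__main__':
    collection = MongoClient('localhost', 27017)['bdhackthon']['Japan_Travel']
    topic_list, popularity, url, article, ref, coord = infer(collection, '淺草寺')
    print(popularity, topic_list[:3], ref)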
Example 4
def main():
    board = 'Japan_Travel'
    conn = MongoClient('localhost', 27017)
    db = conn['bdhackthon']
    collection = db[board]
    d_start = datetime.datetime(2016, 1, 1, 0)
    d_end = datetime.datetime(2016, 3, 1, 0)

    t_start = time.time()
    # build the testing corpus
    if os.path.exists('testing_corpus_data.json'):
        testing_corpus_data = load_json('testing_corpus_data.json')
    else:
        testing_corpus_data = {}
    testing_corpus = []
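    # Tokyo trip/food/lodging articles dated outside the 2016-01-01 to 2016-03-01 training window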
    articles = collection.find(
        {
            "$or": [{
                "article_title": {
                    "$regex": "\[[遊食]記\].*(東京)+.*"
                },
                "date": {
                    "$lt": d_start
                }
            }, {
                "article_title": {
                    "$regex": "\[住宿\].*(東京)+.*"
                },
                "date": {
                    "$lt": d_start
                }
            }, {
                "article_title": {
                    "$regex": "\[[遊食]記\].*(東京)+.*"
                },
                "date": {
                    "$gt": d_end
                }
            }, {
                "article_title": {
                    "$regex": "\[住宿\].*(東京)+.*"
                },
                "date": {
                    "$gt": d_end
                }
            }]
        },
        no_cursor_timeout=True).batch_size(20)

    print('Total:', articles.count())

    i = 0
    tmp_data = {}
    for article in articles:
        tmp_data[article['article_id']] = (article['article_title'],
                                           article['content'])
        print(i)
        print(article['article_title'])
        # reuse cached tokenised features when the article was processed before
        if article['article_id'] in testing_corpus_data:
            testing_corpus.append(
                testing_corpus_data[article['article_id']]['feature'])
            i = i + 1
            continue
        else:
            doc = []
            doc += splitWord(article['article_title'])
            doc += splitWord(article['content'])
            testing_corpus_data[article['article_id']] = {
                'feature': doc,
                'topic': [],
                'index': i
            }
            testing_corpus.append(doc)
            i = i + 1
        #input()
    t_end = time.time()
    write_json(testing_corpus_data, 'testing_corpus_data.json')
    print('time elapsed for building corpus: %f minutes' %
          ((t_end - t_start) / 60.0))

    print('Inferring')
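    # load category names plus the dictionary, tf-idf, LDA model and similarity index saved by the training example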
    category = load_json('category.json')
    dictionary = corpora.Dictionary.load('train.dict')
    corpus_bow = corpora.MmCorpus('train.mm')
    tfidf = models.TfidfModel.load('train.tfidf')
    lda = models.ldamodel.LdaModel.load('train.lda')
    index = similarities.MatrixSimilarity.load('train.index')

    training_corpus_data = load_json('corpus_data.json')

    for testing_aid in testing_corpus_data.keys():
        #print(testing_corpus_data[testing_aid]['feature'])
        print(testing_aid)
        vec_bow = dictionary.doc2bow(
            testing_corpus_data[testing_aid]['feature'])
        vec_tfidf = tfidf[vec_bow]
        vec_lda = lda[vec_tfidf]  # convert the query to LDA space
        sims = index[vec_lda]  # perform a similarity query against the corpus
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        # the most similar training document provides the topic labels
        training_index = str(sims[0][0])
        for training_aid in training_corpus_data.keys():
            if str(training_corpus_data[training_aid]
                   ['index']) == training_index:
                testing_corpus_data[testing_aid][
                    'topic'] = training_corpus_data[training_aid]['topic']
                #for t in testing_corpus_data[testing_aid]['topic']:
                #print(category[t])
                break
    write_json(testing_corpus_data, 'testing_corpus_data_label.json')
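splitWord is imported from elsewhere in the project. A plausible sketch, assuming a jieba-based tokeniser that keeps only non-empty tokens:

import jieba

def splitWord(text):
    # assumed helper: segment Chinese text and drop whitespace-only tokens
    return [w.strip() for w in jieba.cut(text) if w.strip()]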
Example 5
def main():
    board = 'Japan_Travel'
    conn = MongoClient('localhost', 27017)
    db = conn['bdhackthon']
    collection = db[board]
    d_start = datetime.datetime(2016, 1, 1, 0)
    d_end = datetime.datetime(2016, 3, 1, 0)

    t_start = time.time()
    # build the training corpus
    if os.path.exists('corpus_data.json'):
        corpus_data = load_json('corpus_data.json')
    else:
        corpus_data = {}
    corpus = []
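    # Tokyo trip/food/lodging articles inside the 2016-01-01 to 2016-03-01 training window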
    articles = collection.find(
        {
            "$or": [{
                "article_title": {
                    "$regex": "\[[遊食]記\].*(東京)+.*"
                },
                "date": {
                    "$gt": d_start,
                    "$lt": d_end
                }
            }, {
                "article_title": {
                    "$regex": "\[住宿\].*(東京)+.*"
                },
                "date": {
                    "$gt": d_start,
                    "$lt": d_end
                }
            }]
        },
        no_cursor_timeout=True).batch_size(20)
    print('Total:', articles.count())
    index_aid = {}  # map index of corpus to article_id
    i = 0
    tmp_data = {}
    for article in articles:
        tmp_data[article['article_id']] = (article['article_title'],
                                           article['content'])
        index_aid[str(i)] = article['article_id']
        print(i)
        print(article['article_title'])
        # reuse cached tokenised features when the article was processed before
        if article['article_id'] in corpus_data:
            corpus.append(corpus_data[article['article_id']]['feature'])
            corpus_data[article['article_id']]['index'] = i
            i = i + 1
            continue
        else:
            doc = []
            doc += splitWord(article['article_title'])
            doc += splitWord(article['content'])
            corpus_data[article['article_id']] = {
                'feature': doc,
                'topic': [],
                'index': i
            }
            corpus.append(doc)
            i = i + 1
        #input()
    t_end = time.time()
    write_json(corpus_data, 'corpus_data.json')
    print('time elapsed for building corpus: %f minutes' %
          ((t_end - t_start) / 60.0))

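    # gensim pipeline: dictionary -> bag-of-words -> tf-idf -> LDA -> similarity index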
    dictionary = corpora.Dictionary(corpus)
    with open('stop_words.txt', 'r') as f:
        # skip blank lines so split()[0] cannot fail
        stoplist = [line.lower().split()[0] for line in f if line.strip()]
    # remove stop words and words that appear only once
    stop_ids = [
        dictionary.token2id[stopword] for stopword in stoplist
        if stopword in dictionary.token2id
    ]
    once_ids = [
        tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1
    ]
    #once_ids = []
    dictionary.filter_tokens(stop_ids + once_ids)
    dictionary.compactify()  # remove gaps in the id sequence left by removed tokens
    dictionary.save('train.dict')  # store the dictionary, for future reference

    corpus_bow = [dictionary.doc2bow(doc) for doc in corpus]
    corpora.MmCorpus.serialize('train.mm', corpus_bow)  # store to disk, for later use

    tfidf = models.TfidfModel(corpus_bow)  # initialize (train) a model
    tfidf.save('train.tfidf')
    corpus_tfidf = tfidf[corpus_bow]

    lda = models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                   id2word=dictionary,
                                   alpha='auto',
                                   num_topics=50)
    #print(lda.print_topics(50))
    lda.save('train.lda')
    corpus_lda = lda[corpus_tfidf]
    index = similarities.MatrixSimilarity(
        corpus_lda)  # transform corpus to LDA space and index it
    index.save('train.index')

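    # group documents by their single strongest LDA topic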
    topic = {}
    for i in range(len(corpus_lda)):
        key = max(corpus_lda[i], key=lambda x: abs(x[1]))[0]
        if key in topic.keys():
            topic[key].append(i)
        else:
            topic[key] = [i]
        #input()

    print('%d topics identified. Classify them:' % len(topic))

    old_corpus_data = load_json('old_model/corpus_data.json')
    for k, v in topic.items():
        print('Group %s (%d):' % (k, len(v)))
        for c_index in v:
            a_id = index_aid[str(c_index)]
            # reuse existing labels; ask for manual input only when the old model has none
            if a_id in old_corpus_data:
                if not old_corpus_data[a_id]['topic']:
                    print(tmp_data[a_id])
                    line = input('Enter topics, separate by space: ')
                    corpus_data[a_id]['topic'] = line.split(' ')
                else:
                    corpus_data[a_id]['topic'] = old_corpus_data[a_id]['topic']
            else:
                raise ValueError('article_id %s not found in old corpus data' % a_id)
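        # corpus_data is re-written after every labelled group (effectively a checkpoint)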
        write_json(corpus_data, 'corpus_data_labeled.json')
Example 6
    url = 'https://www.ptt.cc/bbs/Japan_Travel/'
    return topic_list, popularity, url, article, ref, coord


if __name__ == '__main__':
    client = MongoClient('localhost', 27017)
    collection = client['bdhackthon']['Japan_Travel']

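    # collect candidate place names from candidate.txt plus the keys of the existing term.json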
    new_term_data = {}
    terms = set()
    with open('candidate.txt', encoding='utf-8') as f:
        place_names = f.readlines()
        for name in place_names:
            terms.add(name.strip())
    term_data = load_json('term.json')
    for k in term_data.keys():
        terms.add(k)
    print(len(terms), terms)
    for term in terms:
        geocode = get_geocode(term)
        if len(geocode) > 0:
            loc = geocode[0].get('geometry').get('location')
            nearby_places = get_nearby(loc, radius=1000)
            total_price = 0
            num_price = 0
            for place in nearby_places:
                if place.get('price_level'):
                    total_price += place.get('price_level')
                    num_price += 1
            if num_price == 0:
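                # the snippet is cut off here; the lines below are only an assumed
                # continuation that averages the reported price_level values
                avg_price = 0
            else:
                avg_price = total_price / float(num_price)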
Example 7
    '一蘭拉麵': {
        'topic': ['美食'],
        'synonym': [],
        'coord': (32.1123123123, 34.12321334),
        'popularity': 50,
    },
    '淺草寺': {
        'topic': ['景點', '攝影', '古蹟'],
        'synonym': [],
        'coord': (32.1123123123, 34.12321334),
        'popularity': 200,
    }
}
'''

'''
category = load_json('category.json')
# adjust the category mapping as needed
write_json(category, 'category.json')
'''

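# give every term a placeholder 'ref' list (the same PTT article repeated three times)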
term = load_json('term.json')
for key in term.keys():
    term[key]['ref'] = [
        'https://www.ptt.cc/bbs/Japan_Travel/M.1461767347.A.EF8.html',
        'https://www.ptt.cc/bbs/Japan_Travel/M.1461767347.A.EF8.html',
        'https://www.ptt.cc/bbs/Japan_Travel/M.1461767347.A.EF8.html'
    ]

write_json(term, 'term.json')