def loadTerms(self):
    """Load term, airport, and hotel data, and bucket terms into
    restaurant and spot lists by the leading digit of their topic ids."""
    termFile = os.path.join(srcDir, "nlp", "new_term.json")
    terms = web_util.load_json(termFile)
    airportFile = os.path.join(srcDir, "nlp", "airport.json")
    airport = web_util.load_json(airportFile)
    hotelFile = os.path.join(srcDir, "nlp", "hotel.json")
    hotels = web_util.load_json(hotelFile)

    self.restaurantList = []
    self.spotList = []
    self.hotelList = []

    # Note: only the last airport entry is kept on self.airport.
    for air in airport:
        self.airport = View(air, airport[air])

    for hotel in hotels:
        self.hotelList.append(View(hotel, hotels[hotel]))

    for term in terms:
        if terms[term]["popularity"] == 0:  # skip terms no article mentions
            continue
        view = View(term, terms[term])
        for topic in view.topicList:
            if topic[0] == "0":    # topic ids starting with "0": restaurants
                self.restaurantList.append(view)
            elif topic[0] == "1":  # topic ids starting with "1": spots
                self.spotList.append(view)
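# For context, a minimal sketch of the View wrapper and the term-record
# schema that loadTerms() appears to assume. The field names mirror the JSON
# used elsewhere in this repo (topic, coord, popularity); the class itself is
# a hypothetical stand-in, not the project's actual implementation.
class View:
    def __init__(self, name, record):
        self.name = name
        self.topicList = record.get("topic", [])       # e.g. ["0", "12"]
        self.coord = record.get("coord", (0, 0))       # (lat, lng)
        self.popularity = record.get("popularity", 0)  # matching-article count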
def main():
    # Drop terms that were never mentioned (popularity == 0) or could not be
    # geocoded (latitude of 0), then write the filtered dict back in place.
    data = load_json('new_term.json')
    new_data = {}
    for k, v in data.items():
        if v['popularity'] != 0 and v['coord'][0] != 0:
            new_data[k] = v
    write_json(new_data, 'new_term.json')
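# Every script in this repo leans on the same two small JSON helpers. A
# minimal sketch of what they presumably look like (an assumption; the repo's
# actual web_util implementation may differ):
import json

def load_json(path):
    with open(path, encoding='utf-8') as f:
        return json.load(f)

def write_json(obj, path):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)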
def infer(collection, term):
    """Look a term up in the crawled articles and summarize it: topic
    distribution, popularity, per-topic reference articles, and coordinates."""
    print('Query term:', term)
    result_list = search_db(collection, term)
    popularity = len(result_list)
    if popularity == 0:
        return [], popularity, '', {}, [], []
    print('Found articles (popularity): ' + str(popularity))

    atopic = load_json('article_topic.json')
    topic_count = {}
    topic_article = {}
    total = 0
    for s in result_list:
        a_id = s['article_id']
        a_date = s['date']
        if a_id in atopic:
            for t in atopic[a_id]:
                total += 1
                topic_count[t] = topic_count.get(t, 0) + 1
                entry = (a_id, len(atopic[a_id]), a_date)
                if t in topic_article:
                    # Avoid re-adding the same article under this topic.
                    if a_id not in (e[0] for e in topic_article[t]):
                        topic_article[t].append(entry)
                else:
                    topic_article[t] = [entry]

    # Prefer articles tagged with few topics (more specific) and, among
    # those, the most recent ones.
    for k, v in topic_article.items():
        topic_article[k] = sorted(v, key=lambda x: (x[1], -x[2].toordinal()))

    article = {k: [ele[0] for ele in v] for k, v in topic_article.items()}

    topic_list = [(k, v, round(v / float(total), 4))
                  for k, v in topic_count.items()]
    topic_list = sorted(topic_list, key=lambda x: x[1], reverse=True)

    # Take up to three reference articles, drawn from the dominant topics.
    ref = []
    for _t in topic_list:
        ref += article[_t[0]]
        if len(ref) > 2:
            break
    ref = ref[:3]

    coord = translate_location(term)
    url = 'https://www.ptt.cc/bbs/Japan_Travel/'
    return topic_list, popularity, url, article, ref, coord
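# A minimal usage sketch for infer(), assuming the same local MongoDB layout
# the other scripts use (database 'bdhackthon', collection 'Japan_Travel')
# and that the helpers infer() relies on (search_db, load_json,
# translate_location) are importable. The query term is just an example:
from pymongo import MongoClient

collection = MongoClient('localhost', 27017)['bdhackthon']['Japan_Travel']
topic_list, popularity, url, article, ref, coord = infer(collection, '淺草寺')
print(popularity, topic_list[:3], ref)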
def main():
    board = 'Japan_Travel'
    conn = MongoClient('localhost', 27017)
    db = conn['bdhackthon']
    collection = db[board]
    d_start = datetime.datetime(2016, 1, 1, 0)
    d_end = datetime.datetime(2016, 3, 1, 0)
    t_start = time.time()

    # Build the testing corpus: Tokyo trip/food/lodging articles *outside*
    # the training window [d_start, d_end].
    if os.path.exists('testing_corpus_data.json'):
        testing_corpus_data = load_json('testing_corpus_data.json')
    else:
        testing_corpus_data = {}
    testing_corpus = []
    articles = collection.find({
        "$or": [
            {"article_title": {"$regex": r"\[[遊食]記\].*(東京)+.*"},
             "date": {"$lt": d_start}},
            {"article_title": {"$regex": r"\[住宿\].*(東京)+.*"},
             "date": {"$lt": d_start}},
            {"article_title": {"$regex": r"\[[遊食]記\].*(東京)+.*"},
             "date": {"$gt": d_end}},
            {"article_title": {"$regex": r"\[住宿\].*(東京)+.*"},
             "date": {"$gt": d_end}},
        ]
    }, no_cursor_timeout=True).batch_size(20)
    print('Total:', articles.count())

    i = 0
    tmp_data = {}
    for article in articles:
        tmp_data[article['article_id']] = (article['article_title'],
                                           article['content'])
        print(i, article['article_title'])
        if article['article_id'] in testing_corpus_data:
            testing_corpus.append(
                testing_corpus_data[article['article_id']]['feature'])
        else:
            doc = []
            doc += splitWord(article['article_title'])
            doc += splitWord(article['content'])
            testing_corpus_data[article['article_id']] = {
                'feature': doc,
                'topic': [],
                'index': i
            }
            testing_corpus.append(doc)
        i += 1
    t_end = time.time()
    write_json(testing_corpus_data, 'testing_corpus_data.json')
    print('time elapsed for building corpus: %f minutes' %
          ((t_end - t_start) / 60.0))

    print('Inferring')
    category = load_json('category.json')
    dictionary = corpora.Dictionary.load('train.dict')
    corpus_bow = corpora.MmCorpus('train.mm')
    tfidf = models.TfidfModel.load('train.tfidf')
    lda = models.ldamodel.LdaModel.load('train.lda')
    index = similarities.MatrixSimilarity.load('train.index')
    training_corpus_data = load_json('corpus_data.json')

    # Label each testing article with the topics of its most similar
    # training article in LDA space.
    for testing_aid in testing_corpus_data.keys():
        print(testing_aid)
        vec_bow = dictionary.doc2bow(testing_corpus_data[testing_aid]['feature'])
        vec_tfidf = tfidf[vec_bow]
        vec_lda = lda[vec_tfidf]  # convert the query to LDA space
        sims = index[vec_lda]     # similarity query against the training corpus
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        training_index = str(sims[0][0])
        for training_aid in training_corpus_data.keys():
            if str(training_corpus_data[training_aid]['index']) == training_index:
                testing_corpus_data[testing_aid]['topic'] = \
                    training_corpus_data[training_aid]['topic']
                break
    write_json(testing_corpus_data, 'testing_corpus_data_label.json')
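# The nearest-neighbour lookup above scans training_corpus_data linearly for
# every testing article. A minimal sketch of the same lookup with a
# precomputed index -> article_id map (field names as in corpus_data.json;
# the function names are hypothetical):
def build_index_map(training_corpus_data):
    return {str(v['index']): aid for aid, v in training_corpus_data.items()}

def lookup_topics(training_corpus_data, index_map, training_index):
    return training_corpus_data[index_map[training_index]]['topic']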
def main():
    board = 'Japan_Travel'
    conn = MongoClient('localhost', 27017)
    db = conn['bdhackthon']
    collection = db[board]
    d_start = datetime.datetime(2016, 1, 1, 0)
    d_end = datetime.datetime(2016, 3, 1, 0)
    t_start = time.time()

    # Build the training corpus: Tokyo trip/food/lodging articles inside
    # the window [d_start, d_end].
    if os.path.exists('corpus_data.json'):
        corpus_data = load_json('corpus_data.json')
    else:
        corpus_data = {}
    corpus = []
    articles = collection.find({
        "$or": [
            {"article_title": {"$regex": r"\[[遊食]記\].*(東京)+.*"},
             "date": {"$gt": d_start, "$lt": d_end}},
            {"article_title": {"$regex": r"\[住宿\].*(東京)+.*"},
             "date": {"$gt": d_start, "$lt": d_end}},
        ]
    }, no_cursor_timeout=True).batch_size(20)
    print('Total:', articles.count())

    index_aid = {}  # map corpus index to article_id
    i = 0
    tmp_data = {}
    for article in articles:
        tmp_data[article['article_id']] = (article['article_title'],
                                           article['content'])
        index_aid[str(i)] = article['article_id']
        print(i, article['article_title'])
        if article['article_id'] in corpus_data:
            corpus.append(corpus_data[article['article_id']]['feature'])
            corpus_data[article['article_id']]['index'] = i
        else:
            doc = []
            doc += splitWord(article['article_title'])
            doc += splitWord(article['content'])
            corpus_data[article['article_id']] = {
                'feature': doc,
                'topic': [],
                'index': i
            }
            corpus.append(doc)
        i += 1
    t_end = time.time()
    write_json(corpus_data, 'corpus_data.json')
    print('time elapsed for building corpus: %f minutes' %
          ((t_end - t_start) / 60.0))

    # Remove stop words and words that appear only once.
    dictionary = corpora.Dictionary(corpus)
    stoplist = [line.lower().split()[0] for line in open('stop_words.txt', 'r')]
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
                if stopword in dictionary.token2id]
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
                if docfreq == 1]
    dictionary.filter_tokens(stop_ids + once_ids)
    dictionary.compactify()  # remove gaps in id sequence after filtering
    dictionary.save('train.dict')  # store the dictionary for future reference

    corpus_bow = [dictionary.doc2bow(doc) for doc in corpus]
    corpora.MmCorpus.serialize('train.mm', corpus_bow)  # store to disk

    tfidf = models.TfidfModel(corpus_bow)  # initialize (train) a model
    tfidf.save('train.tfidf')
    corpus_tfidf = tfidf[corpus_bow]

    lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary,
                                   alpha='auto', num_topics=50)
    lda.save('train.lda')
    corpus_lda = lda[corpus_tfidf]

    # Transform the corpus to LDA space and index it for similarity queries.
    index = similarities.MatrixSimilarity(corpus_lda)
    index.save('train.index')

    # Group documents by their strongest LDA topic.
    topic = {}
    for i in range(len(corpus_lda)):
        key = max(corpus_lda[i], key=lambda x: abs(x[1]))[0]
        if key in topic:
            topic[key].append(i)
        else:
            topic[key] = [i]

    print('%d topics identified.\nClassify them:' % len(topic))
    old_corpus_data = load_json('old_model/corpus_data.json')
    for k, v in topic.items():
        print('Group %s (%d):' % (k, len(v)))
        for c_index in v:
            a_id = index_aid[str(c_index)]
            if a_id in old_corpus_data:
                if not old_corpus_data[a_id]['topic']:
                    # Not labeled yet: ask the annotator.
                    print(tmp_data[a_id])
                    line = input('Enter topics, separated by spaces: ')
                    corpus_data[a_id]['topic'] = line.split(' ')
                else:
                    corpus_data[a_id]['topic'] = old_corpus_data[a_id]['topic']
            else:
                raise ValueError('Empty article_id')
    write_json(corpus_data, 'corpus_data_labeled.json')
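# splitWord() is referenced throughout but not defined in this section. A
# minimal sketch using jieba for Chinese tokenization (an assumption; the
# repo's actual tokenizer may differ):
import jieba

def splitWord(text):
    # Keep non-whitespace tokens of length >= 2 to cut single-character noise.
    return [tok for tok in jieba.cut(text) if len(tok.strip()) >= 2]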
if __name__ == '__main__':
    client = MongoClient('localhost', 27017)
    collection = client['bdhackthon']['Japan_Travel']
    new_term_data = {}

    # Merge candidate place names from candidate.txt with the keys of the
    # existing term.json.
    terms = set()
    with open('candidate.txt', encoding='utf-8') as f:
        place_names = f.readlines()
    for name in place_names:
        terms.add(name.strip())
    term_data = load_json('term.json')
    for k in term_data.keys():
        terms.add(k)
    print(len(terms), terms)

    # Geocode each term and estimate an average price level from nearby
    # places.
    for term in terms:
        geocode = get_geocode(term)
        if len(geocode) > 0:
            loc = geocode[0].get('geometry').get('location')
            nearby_places = get_nearby(loc, radius=1000)
            total_price = 0
            num_price = 0
            for place in nearby_places:
                if place.get('price_level'):
                    total_price += place.get('price_level')
                    num_price += 1
            # Assumed fallback; avg_price is a hypothetical name, as the
            # original cuts off at this condition:
            if num_price == 0:
                avg_price = 0
            else:
                avg_price = total_price / float(num_price)
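# get_geocode() / get_nearby() are not shown in this section. A minimal
# sketch against the Google Geocoding and Places Nearby Search web APIs,
# whose responses match the fields read above (results[i].geometry.location,
# price_level). GOOGLE_API_KEY and both function names are assumptions:
import requests

GOOGLE_API_KEY = 'YOUR_KEY'

def get_geocode(term):
    r = requests.get('https://maps.googleapis.com/maps/api/geocode/json',
                     params={'address': term, 'key': GOOGLE_API_KEY})
    return r.json().get('results', [])

def get_nearby(loc, radius=1000):
    r = requests.get(
        'https://maps.googleapis.com/maps/api/place/nearbysearch/json',
        params={'location': '%f,%f' % (loc['lat'], loc['lng']),
                'radius': radius, 'key': GOOGLE_API_KEY})
    return r.json().get('results', [])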
'一蘭拉麵': {             # Ichiran Ramen
    'topic': ['美食'],    # food
    'synonym': [],
    'coord': (32.1123123123, 34.12321334),
    'popularity': 50,
},
'淺草寺': {                          # Sensō-ji
    'topic': ['景點', '攝影', '古蹟'],  # spot, photography, historic site
    'synonym': [],
    'coord': (32.1123123123, 34.12321334),
    'popularity': 200,
}
}
'''
'''
category = load_json('category.json')
# do what you want about category
write_json(category, 'category.json')
'''

# Stamp a placeholder reference list onto every term.
term = load_json('term.json')
for key in term.keys():
    term[key]['ref'] = [
        'https://www.ptt.cc/bbs/Japan_Travel/M.1461767347.A.EF8.html',
        'https://www.ptt.cc/bbs/Japan_Travel/M.1461767347.A.EF8.html',
        'https://www.ptt.cc/bbs/Japan_Travel/M.1461767347.A.EF8.html'
    ]
write_json(term, 'term.json')
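# A minimal sketch of reading term.json back into the schema the docstring
# above describes (field names from the docstring; note that JSON stores
# 'coord' as a list, not a tuple):
term = load_json('term.json')
for name, rec in term.items():
    lat, lng = rec['coord']
    print(name, rec['popularity'], rec['topic'], (lat, lng))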