def get_a_news():
    """Flask endpoint: return a single news item as JSON.

    Accepts either ``id`` (primary key, faster lookup) or ``news_id``
    (source id) as query parameters; ``id`` wins when both are present.

    Fix: the DB session is now released in a ``finally`` block, so it is
    no longer leaked when the lookup raises.
    """
    news_id = flask.request.args.get('news_id')
    _id = flask.request.args.get('id')
    session = datasources.get_db().create_session()
    try:
        if _id is not None:  # more fast: primary-key lookup
            news_detail = datasources.get_db().find_news_by_id(session, _id)
        else:
            news_detail = datasources.get_db().find_news_by_source_id(
                session, news_id)
        data = {
            'news_id': news_id,
            'review_num': news_detail.review_num,
            'word_num': news_detail.word_num,
            'abstract': news_detail.abstract,
            'content': news_detail.content,
            'keywords': news_detail.keywords,
            'title': news_detail.title,
            'url': news_detail.url,
            'id': news_detail.id,
            'media_name': news_detail.media_name,
            'time': news_detail.time,
            'related_id': news_detail.related_id
        }
    finally:
        # Always release the session, even if the lookup fails.
        datasources.get_db().close_session(session)
    return flask.jsonify(data)
def test_filter_by_coocurrence(self):
    """Only triples whose member pairs co-occur in stored news survive."""
    samples = [
        "abate", "bolster", "buttress", "champion", "defend", "espouse",
        "support"
    ]
    # Each pair below is written into one tiny news document.
    for first, second in [(0, 1), (0, 2), (0, 3), (0, 2), (1, 2), (1, 3),
                          (3, 4)]:
        news = entities.news.NewsPlain(
            title='', content=samples[first] + ' ' + samples[second])
        datasources.get_db().upsert_news_or_news_list(self.session, news)
    indexes.IndexHolder().word_coocurrence_index.init(force_refresh=True)
    candidates = list(itertools.combinations(samples, 3))
    expected = [
        (samples[0], samples[1], samples[2]),
        (samples[0], samples[1], samples[3]),
        (samples[0], samples[2], samples[3]),
    ]
    self.assertListEqual(filters.filter_by_coocurrence(candidates, 3),
                         expected)
def test_posting_index(self):
    """OR should union postings across words; AND should intersect them."""
    stored_news = datasources.get_db().upsert_news_or_news_list(
        self.session,
        [entities.news.NewsPlain(source_id=1),
         entities.news.NewsPlain(source_id=2)])
    postings = [
        entities.words.WordPosting(news_id=stored_news[0].id),
        entities.words.WordPosting(news_id=stored_news[0].id),
        entities.words.WordPosting(news_id=stored_news[1].id),
    ]
    word_a = entities.words.Word(text="a")
    word_b = entities.words.Word(text="b")
    word_a.posting_list = postings[:1]  # appears in news 0 only
    word_b.posting_list = postings[1:]  # appears in news 0 and news 1
    stored_words = datasources.get_db().upsert_word_or_word_list(
        self.session, [word_a, word_b])
    index = indexes.posting_index.PostingIndex()
    index.init(force_refresh=True)
    word_ids = [stored_words[0].id, stored_words[1].id]
    # Union covers both news; intersection keeps only news 0.
    self.assertEqual(len(index.collect(word_ids, index.LogicAction.OR)), 2)
    self.assertEqual(len(index.collect(word_ids, index.LogicAction.AND)), 1)
def test_find_news_plain_text(self):
    """The plain-text query yields a one-row, three-column DataFrame."""
    datasources.get_db().upsert_news_or_news_list(self.session,
                                                  self.news_sample)
    texts_df = datasources.get_db().find_news_plain_text(self.session)
    self.assertTrue(isinstance(texts_df, pandas.DataFrame))
    self.assertEqual((1, 3), texts_df.shape)
    self.assertEqual(self.news_sample.title, texts_df.title[0])
    self.assertEqual(self.news_sample.content, texts_df.content[0])
def runSQL():
    """Execute every statement in ``dump.sql`` against the database.

    Lines starting with ``--`` and blank lines are skipped; a statement
    may span several lines and is executed once a terminating ``;`` is
    seen. Execution is best-effort: a failing statement is reported and
    processing continues with the next one.

    Fixes: the dump file is now closed via ``with`` (it was never
    closed), the caught exception is included in the error report
    (instead of a bare ``'Ops'``), and the session is released even if
    reading the file raises.
    """
    session = datasources.get_db().create_session()
    sql_command = ''
    try:
        with open("dump.sql", 'r') as sql_file:
            for line in sql_file:
                # Ignore commented and blank lines.
                if line.startswith('--') or not line.strip('\n'):
                    continue
                # Accumulate the statement across physical lines.
                sql_command += line.strip('\n')
                # A trailing ';' marks a complete statement.
                if sql_command.endswith(';'):
                    try:
                        session.execute(sql_command)
                        session.commit()
                    except Exception as e:
                        # Best-effort: report and move on.
                        print('Ops', e)
                    finally:
                        # Clear the buffer for the next statement.
                        sql_command = ''
    finally:
        datasources.get_db().close_session(session)
def test_word_text_index_similar(self):
    """SIMILAR collection should respect the edit-distance threshold."""
    samples = [
        "野猫", "野小猫", "野生小猫", "猫", "野狗", "豹猫", "小野猫", "1000", "10",
        "001", "010", "0", "1", "01"
    ]
    for text in samples:
        datasources.get_db().upsert_word_or_word_list(
            self.session, entities.words.Word(text=text))
    index = indexes.word_text_index.WordTextIndex()
    index.init(force_refresh=True)

    # Distance 1 around "野猫".
    collected = index.collect("野猫",
                              action=index.CollectionAction.SIMILAR,
                              threshold=1)
    collected.sort()
    self.assertListEqual(collected,
                         sorted(["野猫", "野小猫", "猫", "野狗", "豹猫", "小野猫"]))

    # Distance 2 around "猫" also pulls in short numeric strings.
    collected = index.collect("猫",
                              action=index.CollectionAction.SIMILAR,
                              threshold=2)
    collected.sort()
    self.assertListEqual(
        collected,
        sorted(["0", "1", "猫", "01", "10", "野猫", "豹猫", "野狗", "小野猫", "野小猫"]))
def test_find_word_plain_text_ordered_by_text(self):
    """Word texts should come back lexicographically ordered."""
    samples = ["1000", "10", "001", "010", "0", "1", "01"]
    for text in samples:
        datasources.get_db().upsert_word_or_word_list(
            self.session, entities.words.Word(text=text))
    found = datasources.get_db().find_word_plain_text_ordered_by_text(
        self.session)
    self.assertListEqual(found, sorted(samples))
def setUp(self):
    """
    In fact, it is informal way to build a TestCase like this.

    Rebuilds the schema, opens a fresh session, and prepares shared
    fixtures: one news row, one review attached to it, and one word
    with a single posting for inverted-index tests.
    :return:
    """
    # Start every test from a clean schema and a fresh session.
    datasources.get_db().recreate_all_tables()
    self.session = datasources.get_db().create_session()
    # Realistic Sina news fixture.
    self.news_sample = entities.news.NewsPlain(
        source=entities.news.NewsPlain.SourceEnum.sina,
        source_id="comos-fynfvar5143551",
        url="http://news.sina.com.cn/s/wh/2017-10-30/doc-ifynfvar5143551.shtml",
        title="无牌宝马高速狂飙 警察截停后发现司机没有手",
        keywords="民警,宝马,无臂",
        media_name="大洋网",
        abstract="原标题:我伙呆!断臂“老司机”高速上驾宝马狂飙,副驾上还坐着他老婆",
        content="原标题:我伙呆!断臂“老司机”高速上驾宝马狂飙,副驾上还坐着他老婆\n一个失去双手小臂的大叔,却开着宝马带上妻子闯天涯。...",
        time=datetime.datetime.strptime("2017/10/30 12:28:54",
                                        "%Y/%m/%d %H:%M:%S"),
        review_num=1)
    # One review attached to the news fixture.
    self.review_sample = entities.review.ReviewPlain(
        user_id=3241538043,
        user_name="Adams_7276",
        area="江苏南京",
        content="“镜面人”大脑结构是不是也相反?",
        time=datetime.datetime.strptime("2017-10-30 11:31:05",
                                        "%Y-%m-%d %H:%M:%S"),
        agree=0)
    self.news_sample.reviews = [self.review_sample]
    # One word plus one posting for the inverted-index tests.
    self.word_posting_sample = entities.words.WordPosting(
        tf=3, title_positions=[1], content_positions=[1, 5])
    self.word_sample = entities.words.Word(text="江苏", pos="N", df=3, cf=3)
    self.word_sample.posting_list = [self.word_posting_sample]
def update(self, num=20):
    """Crawl ``num`` items, preprocess them, then rebuild all indexes.

    A DB session is held only for the crawl/preprocess phase; the
    ``sqlsession`` attribute is reset to ``None`` once done.
    """
    self.sqlsession = datasources.get_db().create_session()
    self.crawl(num)
    self.prepossess()
    datasources.get_db().close_session(self.sqlsession)
    # Index rebuilding can take a while; tell the operator.
    print('wait... we need to init IndexHolder...')
    indexes.IndexHolder().init(force_refresh=True)
    self.sqlsession = None
def test_filter_by_avgtfidf(self):  # TODO: to be absorb
    """Top-3 by average tf-idf should be the three highest-cf words."""
    samples = ["1000", "10", "001", "010", "0", "1", "01"]
    for rank, text in enumerate(samples):
        datasources.get_db().upsert_word_or_word_list(
            self.session,
            entities.words.Word(text=text, df=1, cf=rank + 1))
    # cf grows with position, so the winners are the last three samples,
    # highest first.
    self.assertEqual(filters.filter_by_avgtfidf(samples, 3),
                     samples[-1:-4:-1])
def test_upsert_news_or_news_list(self):
    """
    ugly, but simple
    :return:
    """
    datasources.get_db().upsert_news_or_news_list(self.session,
                                                  self.news_sample)
    stored = datasources.get_db().find_news_list(self.session)
    first = stored[0]
    # The news row round-trips and its review got a foreign key.
    self.assertEqual(first.source_id, self.news_sample.source_id)
    self.assertIsNotNone(first.reviews[0].news_id)
    self.assertEqual(first.reviews[0].content,
                     self.news_sample.reviews[0].content)
def related_news():
    """Flask endpoint: suggest news similar to the given ``source_id``."""
    source_id = flask.request.args.get('source_id')
    session = datasources.get_db().create_session()
    suggestions = functions.suggest.suggest_similar_news(session, source_id)
    datasources.get_db().close_session(session)
    return flask.jsonify({'content': suggestions})
def test_vocab_index(self):
    """Known words are collectable (without postings); unknown are not."""
    for text in ["1000", "10", "001", "010", "0", "1", "01"]:
        datasources.get_db().upsert_word_or_word_list(
            self.session, entities.words.Word(text=text))
    index = indexes.vocab_index.VocabIndex()
    index.init()
    hit = index.collect("10")
    self.assertIsNotNone(hit)
    # The vocab index stores words without their posting lists.
    self.assertEqual(len(hit.posting_list), 0)
    self.assertIsNone(index.collect("a"))
def test_word_text_index_prefix(self):
    """An empty PREFIX query should enumerate the whole vocabulary."""
    samples = ["1000", "10", "001", "010", "0", "1", "01"]
    for text in samples:
        datasources.get_db().upsert_word_or_word_list(
            self.session, entities.words.Word(text=text))
    index = indexes.word_text_index.WordTextIndex()
    index.init(force_refresh=True)
    collected = index.collect('', action=index.CollectionAction.PREFIX)
    collected.sort()
    self.assertListEqual(collected, sorted(samples))
def build(self, text_df=None):
    """Train the Word2Vec synonym model and persist it to the cache path.

    When ``text_df`` is omitted, the training corpus is read from the
    news table and converted into a Spark DataFrame.
    """
    if text_df is None:
        sqlsession = datasources.get_db().create_session()
        plain_text = datasources.get_db().find_news_plain_text(sqlsession)
        text_df = config.get_spark_session().createDataFrame(plain_text)
        datasources.get_db().close_session(sqlsession)
    train_data = update.segment.cut4synonym_index(text_df)
    self.model = pyspark.mllib.feature.Word2Vec().fit(train_data)
    self.model.save(config.get_spark_context(),
                    config.indexes_config.word_synonym_model_cache_path)
def test_upsert_word_or_word_list(self):
    """
    ugly, but simple
    :return:
    """
    # The posting needs a real news id, so persist the news first.
    self.news_sample = datasources.get_db().upsert_news_or_news_list(
        self.session, self.news_sample)
    self.word_posting_sample.news_id = self.news_sample.id
    datasources.get_db().upsert_word_or_word_list(self.session,
                                                  self.word_sample)
    stored = datasources.get_db().find_word_list(self.session)
    first = stored[0]
    self.assertEqual(first.text, self.word_sample.text)
    self.assertIsNotNone(first.posting_list[0].word_id)
    self.assertListEqual(first.posting_list[0].title_positions,
                         self.word_posting_sample.title_positions)
def get_snippet():
    """Flask endpoint: build a highlighted snippet for one news item."""
    news_id = int(flask.request.args.get("news_id"))
    raw_search = flask.request.args.get("search_text")
    # Strip wildcard noise, then split on spaces into search terms.
    word_regex_list = utils.utils.remove_wild_char(raw_search).split(' ')
    length = flask.request.args.get("length")
    if length is not None:
        length = int(length)
    session = datasources.get_db().create_session()
    snippet = functions.snippet.gen_snippet_with_wildcard(
        session, word_regex_list, news_id, length)
    datasources.get_db().close_session(session)
    return snippet, 200
def search():
    """Flask endpoint: run a universal search and return paged results."""
    query = flask.request.args.get("query")
    ranking = flask.request.args.get("ranking-by")
    page = flask.request.args.get('page')
    session = datasources.get_db().create_session()
    results_count, result_list, good_search_mode = (
        functions.search.universal_search(session, query, int(ranking),
                                          int(page)))
    datasources.get_db().close_session(session)
    return flask.jsonify({
        'results_count': results_count,
        'result_list': result_list,
        'good_search_mode': good_search_mode
    })
def test_word_text_index(self):
    """Collecting with no arguments should enumerate every stored word."""
    samples = [
        "野猫", "野小猫", "野生小猫", "猫", "野狗", "豹猫", "小野猫", "1000", "10",
        "001", "010", "0", "1", "01", "1001"
    ]
    for text in samples:
        datasources.get_db().upsert_word_or_word_list(
            self.session, entities.words.Word(text=text))
    index = indexes.word_text_index.WordTextIndex()
    index.init(force_refresh=True)
    collected = index.collect()
    collected.sort()
    self.assertListEqual(collected, sorted(samples))
def test_analyze_emotion4news(self):
    """Every review of a news item gets exactly one emotion score."""
    review_texts = ["真不错", "", "真差", "ABCDSFDSFADFA", '1' * 500]
    news_sample = entities.news.NewsPlain(reviews=[
        entities.review.ReviewPlain(content=text) for text in review_texts
    ])
    session = datasources.get_db().create_session()
    news_sample = datasources.get_db().upsert_news_or_news_list(
        session, news_sample)
    scores = functions.emotions.analyze_emotion4news(session, news_sample.id)
    self.assertEqual(len(scores), 5)
    datasources.get_db().close_session(session)
def suggest_similar_news(session, source_id):
    """Suggest news similar to the article identified by ``source_id``.

    Uses the hot-news list cached in redis as the comparison corpus;
    the corpus is (re)built on the first call after the redis marker
    ``similar_news_from_hot_news`` disappears.
    """
    redis_op = datasources.get_redis().redis_op()
    test_news = datasources.get_db(
    ).find_news_abstract_and_content_by_source_id(session, source_id)
    test_data = []
    if test_news.abstract is None:
        # No abstract: fall back to the nouns of the article body.
        for w, pos, start, end in update.segment.tokenize(test_news.content):
            if pos.startswith('n'):
                test_data.append(w)
    if redis_op.exists('similar_news_from_hot_news'):
        # Corpus already prepared: predict directly.
        # NOTE(review): ``test_news`` cannot be None at this point (its
        # ``.abstract`` was just read above), so the third argument is
        # always False — possibly ``test_news.abstract is None`` was
        # intended. TODO confirm against suggest_similar_news_predict.
        return suggest_similar_news_predict(
            redis_op, test_news.abstract or test_data,
            True if test_news is None else False)
    # Rebuild the comparison corpus from the cached hot-news list.
    p = redis_op.lrange('hot_news_list', 0, -1)
    # Cached entries are Python reprs; coerce them into valid JSON.
    p = [u.replace('\'', '"').replace('None', 'null') for u in p]
    p = [json.loads(u) for u in p]
    raw_corpora = [u['abstract'] for u in p]
    update.similar_text.corpora_process(raw_corpora)
    # Mark the corpus as ready for subsequent calls.
    redis_op.set('similar_news_from_hot_news', 1)
    return suggest_similar_news_predict(redis_op,
                                        test_news.abstract or test_data,
                                        True if test_news is None else False)
def analyze_emotion4news(session, news_id):
    """Return one emotion score per review of the given news item.

    Raises ``NewsNotFoundException`` when ``news_id`` does not exist.
    """
    news = datasources.get_db().find_news_by_id(session, news_id)
    if news is None:
        raise my_exceptions.datasources_exceptions.NewsNotFoundException(
            news_id)
    return [analyze_emotion4review(review.content) for review in news.reviews]
def saving_foreachPartition(rd):
    """Spark ``foreachPartition`` callback: persist one partition of the
    inverted index.

    ``rd`` yields ``(text, pitr)`` pairs where ``text`` is
    ``"<word>\\t<pos>"`` and ``pitr`` iterates per-document position
    records (dicts with ``news_id``, ``title``, ``content``). Imports
    are function-local because this code runs on Spark executors.
    """
    import config
    import datasources
    import entities.words
    import logs.loggers
    logger = logs.loggers.LoggersHolder().get_logger("updater")
    logger.info(config.spark_config.testing)
    session = datasources.get_db().create_session()
    for text, pitr in rd:
        word_text = text[:text.rindex(
            '\t')]  # text in fact is a word plus its part of speech.
        pos = text[text.rindex('\t') + 1:]
        posting_list = []  # word maps some document id.
        # df/cf accumulate across the posting records below.
        word = entities.words.Word(text=word_text, df=0, cf=0, pos=pos)
        for posting_j in pitr:  # specific document record.
            tf = len(posting_j["title"]) + len(
                posting_j["content"]
            )  # term frequency of the word in this document.
            word.cf += tf
            word.df += 1
            word_posting = entities.words.WordPosting(
                news_id=posting_j["news_id"],
                title_positions=posting_j["title"],
                content_positions=posting_j["content"],
                tf=tf)
            posting_list.append(word_posting)
        word.posting_list = posting_list
        # Defer the commit until the whole partition has been staged.
        datasources.get_db().upsert_word_or_word_list(session, word,
                                                      commit_now=False)
    datasources.get_db().commit_session(session)
    datasources.get_db().close_session(session)
def test_word_cooccurrence_index(self):
    """Co-occurrence collection should be order-insensitive."""
    contents = [
        "Tom,Ann,Cindy,Dave", "Betty,Ann,Cindy,Cindy,Eve",
        "Eve,Eve,Fenn,Fenn"
    ]
    news_list = [
        entities.news.NewsPlain(source_id=source_id, content=content,
                                title="")
        for source_id, content in enumerate(contents)
    ]
    datasources.get_db().upsert_news_or_news_list(self.session, news_list)
    index = indexes.word_cooccurrence_index.WordCoOccurrenceIndex()
    index.init(force_refresh=True)
    self.assertEqual(index.collect(["Ann", "Betty", "Cindy"]),
                     index.collect(["Betty", "Ann", "Cindy"]))
    self.assertEqual(index.collect(["Ann", "Betty", "Cindy"]),
                     index.collect(["Ann", "Cindy", "Dave"]))
def test_gen_snippet(self):
    """Exercise snippet generation: keyword hits, length truncation, and
    the abstract fallback when no keyword matches the content."""
    news_text = "常用标点符号用法简表\n一、基本定义\n 句子,前后都有停顿,并带有一定的句调,表示相对完整的意义。句子前后或中间的停顿,在口头语言中,表现出来就是时间间隔,在书面语言中,就用标点符号来表示。一般来说,汉语中的句子分以下几种:\n 陈述句:用来说明事实的句子。\n 祈使句:用来要求听话人做某件事情的句子。\n 疑问句:用来提出问题的句子。\n 感叹句:用来抒发某种强烈感情的句子。\n 复句、分句:意思上有密切联系的小句子组织在一起构成一个大句子。这样的大句子叫复句,复句中的每个小句子叫分句。\n 构成句子的语言单位是词语,即词和短语(词组)。词即最小的能独立运用的语言单位。短语,即由两个或两个以上的词按一定的语法规则组成的表达一定意义的语言单位,也叫词组。\n 标点符号是书面语言的有机组成部分,是书面语言不可缺少的辅助工具。它帮助人们确切地表达思想感情和理解书面语言。"
    news_sample = entities.news.NewsPlain(
        content=utils.utils.remove_wild_char(news_text),
        abstract="ABSTRACT")
    session = datasources.get_db().create_session()
    news_sample = datasources.get_db().upsert_news_or_news_list(
        session, news_sample)
    # Both keywords occur in one short sentence.
    snippet = functions.snippet.gen_snippet(session, ["陈述", "事实"],
                                            news_id=news_sample.id,
                                            length=20)
    self.assertIn("陈述句:用来说明事实的句子。", snippet)
    # Keywords spanning a longer window.
    snippet = functions.snippet.gen_snippet(session, ["组织", "复句"],
                                            news_id=news_sample.id,
                                            length=50)
    self.assertIn("组织在一起构成一个大句子。这样的大句子叫复句", snippet)
    # One keyword absent: snippet centers on the matching one.
    snippet = functions.snippet.gen_snippet(session, ["猫", "复句"],
                                            news_id=news_sample.id,
                                            length=20)
    self.assertIn("复句、分句:", snippet)
    # Match at the very start of the document.
    snippet = functions.snippet.gen_snippet(session, ["常用", "简表"],
                                            news_id=news_sample.id,
                                            length=20)
    self.assertIn("常用标点符号用法简表", snippet)
    # A length shorter than the match still yields a prefix.
    snippet = functions.snippet.gen_snippet(session, ["常用", "简表"],
                                            news_id=news_sample.id,
                                            length=5)
    self.assertIn("常用", snippet)
    # No keyword at all: fall back to the abstract, truncated to length.
    snippet = functions.snippet.gen_snippet(session, ["猫"],
                                            news_id=news_sample.id,
                                            length=5)
    self.assertEqual("ABSTR", snippet)
    datasources.get_db().close_session(session)
def build(self):
    """Build the word-text prefix tree from all word texts in the DB.

    Words arrive lexicographically ordered, so all words sharing a
    first character form a contiguous run; each run is compiled via a
    temporary ``NodeTmp`` builder and attached under
    ``self.tree.children[c]`` once the run ends.
    """
    session = datasources.get_db().create_session()
    word_texts = datasources.get_db().find_word_plain_text_ordered_by_text(session)
    datasources.get_db().close_session(session)
    self.tree = Node()
    c = None  # first character of the run currently being built
    tree_tmp = NodeTmp()
    # NOTE(review): ``register`` presumably lets NodeTmp share/dedupe
    # suffix nodes within a run — confirm against NodeTmp.add.
    register = dict()
    for i, word_text in enumerate(word_texts):
        if not word_text:
            continue
        if word_text[0] != c:
            # A new first character starts: flush the finished run.
            if c is not None:
                self.tree.children[c] = tree_tmp.get_node()
            tree_tmp = NodeTmp()
            register = dict()
            c = word_text[0]
        # Insert the word minus its first character into the subtree.
        tree_tmp.add(word_text[1:], register)
    # Flush the final run.
    if c is not None:
        self.tree.children[c] = tree_tmp.get_node()
def get_review():
    """Flask endpoint: return a news item's reviews plus an overall
    positive-emotion ratio.

    Fix: the original divided by ``len(data)`` unconditionally and
    raised ``ZeroDivisionError`` for news with no reviews; the ratio is
    now reported as 0.0 in that case.
    """
    new_id = flask.request.args.get('id')
    session = datasources.get_db().create_session()
    reviews = datasources.get_db().find_reviews_by_news_id(
        session, int(new_id))
    datasources.get_db().close_session(session)
    data = [{
        'agree': review.agree,
        'content': review.content,
        'emotion': functions.emotions.analyze_emotion4review(review.content)
    } for review in reviews]
    # negative: -1; positive : 0 — shifting by +1 counts positives.
    positive_counts = sum(review['emotion'] + 1 for review in data)
    emotions = positive_counts * 1.0 / len(data) if data else 0.0
    return flask.jsonify({'review': data, 'emotions': emotions})
def suggest_hot_news(session, page):
    """
    check that documents in redis(cache) is not expired
    :param session: DB session used only when the cache must be rebuilt
    :param page: 1-based page number; 10 items per page
    :return: list of hot-news dicts for the requested page
    """
    # check first.
    # if expired, we should construct 1000 hot news again
    redis_op = datasources.get_redis().redis_op()
    EXPIRED = not redis_op.exists('hot_news_list')
    print('Expired:', EXPIRED)
    if EXPIRED:
        r = datasources.get_db().find_hot_news(session, 100)
        cache = [{
            'title': news.title.replace('"', '“').replace('\'', '“'),
            'abstract': news.abstract.replace('"', '“').replace('\'', '“'),
            'time': str(news.time),
            'keywords': news.keywords.replace('"', '“').replace('\'', '“'),
            'source_id': news.source_id
        } for news in r]
        # we should cache the variable cache into redis.
        redis_op.lpush('hot_news_list', *cache)
        redis_op.expire('hot_news_list', config.cache_config.expire)
        # Invalidate the similar-news corpus derived from this list.
        redis_op.delete('similar_news_from_hot_news')
        if len(r) > 10:
            candidate = cache[:10]
        else:
            candidate = cache
        return candidate
    else:  # to read redis.
        # FIX: key was misspelled 'host_news_list', so llen was always 0
        # and every page beyond the first was served incorrectly.
        llen = redis_op.llen('hot_news_list')
        if (page - 1) * 10 > llen:
            candidate = []
        elif page * 10 > llen:
            candidate = redis_op.lrange('hot_news_list', (page - 1) * 10, -1)
        else:
            # FIX: LRANGE's stop index is inclusive — the original used
            # page * 10 and returned 11 items per page.
            candidate = redis_op.lrange('hot_news_list', (page - 1) * 10,
                                        page * 10 - 1)
        candidate = [
            u.replace('\'', '"').replace('None', 'null') for u in candidate
        ]
        candidate = [
            json.loads(u) for u in candidate
        ]  # FIXME json.decoder.JSONDecodeError: Expecting ',' delimiter: line 1 column 18 (char 17)
        return candidate
def collect(self, word_ids, action=LogicAction.OR):
    """Gather postings for ``word_ids`` grouped by news id.

    Returns ``{news_id: {word_id: posting}}``. With ``AND`` only news
    containing every requested word are kept. (There is no inexact
    top-K collecting method.)
    """
    ans = dict()
    for word_id in word_ids:
        postings = datasources.get_db().find_word_posting_list_by_word_id(
            self.session, word_id)
        for posting in postings:
            ans.setdefault(posting.news_id, dict())[word_id] = posting
    if action == PostingIndex.LogicAction.AND:
        required = len(word_ids)
        ans = {
            news_id: words
            for news_id, words in ans.items() if len(words) == required
        }
    return ans
def universal_search(session, search_text, ranking, page, num_in_page=10):
    """Full search pipeline: segment the query (with ``*`` wildcard
    support), collect matching news, rank them, and return one page.

    :param ranking: one of RELEVENCE_RANKING / HOT_RANKING /
        TIME_INCREASE_RANKING / (time-decrease default)
    :return: (total hit count, page of result dicts,
              [joined segmented words, wildcard-expanded words])
    """
    # segment search_text. TODO
    word_regex_list = search_text.split(' ')
    word_regex_list = list(set(word_regex_list))
    try:
        # print(word_regex_list)
        word_regex_list.remove('')
    except ValueError:
        pass
    # Part-of-speech tags to drop (punctuation, particles, ...).
    stop_nature_list = ['', 'w', 'x', 'y', 'c']
    word_processed_list = list()
    for u in word_regex_list:
        v = list()
        for w, pos, s, e in update.segment.tokenize(u):
            if pos == 'x' and w == '*':
                # Keep wildcard stars as standalone tokens.
                v.append(w)
            elif pos != 'x':
                if len(v) > 0 and v[-1].startswith('*'):
                    # Glue a word onto a preceding wildcard: '*foo'.
                    v[-1] += w
                elif pos not in stop_nature_list:
                    v.append(w)
        if len(v) > 0 and v[-1] == '*':
            if len(v) > 1:
                # A trailing star attaches to the previous token: 'foo*'.
                v[-2] += '*'
            else:
                v = []
        word_processed_list += v
    # word_processed_list includes words segmented.
    fl = '*' in ''.join(word_processed_list)  # any wildcard present?
    segment_word_4_search_list = list()
    word_regex_list = list()
    for u in word_processed_list:
        if '*' in u:
            word_regex_list.append(u)
        else:
            # Re-segment plain words in 'search' mode for finer grains.
            for v in update.segment.tokenize(u, mode='search'):
                if v[1] not in stop_nature_list:
                    segment_word_4_search_list.append(v[0])
    regex_search_list = list()
    if fl:
        # Expand wildcard patterns into concrete vocabulary words.
        regex_search_list = functions.suggest.suggest_similar_search(
            word_regex_list, 1)[0]
        try:
            regex_search_list.remove(':')
        except ValueError:
            pass
        segment_word_4_search_list += regex_search_list
    segment_word_4_search_list = list(set(segment_word_4_search_list))
    print('segment for search: ', segment_word_4_search_list)
    # NOTE(review): ``key`` is never read afterwards — dead assignment?
    key = segment_word_4_search_list
    ranking_set = search(segment_word_4_search_list, ranking)
    if ranking != RELEVENCE_RANKING:
        # Replace relevance scores with the requested ranking key.
        ranking_set = datasources.get_db().find_news_time_and_review_num_by_id(
            session, ranking_set)
        if ranking == HOT_RANKING:
            ranking_set = [(u.id, u.review_num) for u in ranking_set]
        else:
            ranking_set = [(u.id, u.time) for u in ranking_set]
    # Sort (id, key) pairs: ascending only for time-increase ranking.
    if ranking == TIME_INCREASE_RANKING:
        ranking_set.sort(key=lambda k: k[1])
    else:
        ranking_set.sort(key=lambda k: k[1], reverse=True)
    # need to return Length of ranking_set,[news_brief]
    # import pdb
    # pdb.set_trace()
    candidate_id_list = [
        u[0]
        for u in ranking_set[(page - 1) * num_in_page:page * num_in_page]
    ]
    if len(candidate_id_list) == 0:
        return 0, [], [' '.join(segment_word_4_search_list),
                       regex_search_list]
    result_list = [
        {
            'news_id': row.source_id,
            'title': row.title,
            # 'source': row.source.sina,
            'time': row.time,
            'id': row.id
        } for row in datasources.get_db().find_news_brief_by_id(
            session, candidate_id_list)
    ]
    # Re-attach each brief's ranking score by matching ids.
    for r in result_list:
        for k, v in ranking_set[(page - 1) * num_in_page:page * num_in_page]:
            if r.__contains__('score') is False and k == r['id']:
                r['score'] = v
                break
    result_list.sort(
        key=lambda r: r['score'],
        reverse=True if ranking != TIME_INCREASE_RANKING else False)
    return len(ranking_set), result_list, [
        ' '.join(segment_word_4_search_list), regex_search_list
    ]