def get(self, topic):
    logging.info(topic)
    topics = utils.get_topics()
    data = utils.get_data_dict(topic)
    print(data)
    params = {
        "topic": topic,
        "topics": topics,
        "data": data,
    }
    self.render('topic.html', **params)
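# Context sketch (assumed, not from the original snippet): a method like get(self, topic)
# above would normally sit on a tornado.web.RequestHandler subclass and be registered with
# a URL pattern whose capture group supplies `topic`. The class name and route below are
# illustrative only.
import tornado.ioloop
import tornado.web


class TopicHandler(tornado.web.RequestHandler):
    # the get(self, topic) shown above would go here
    pass


if __name__ == '__main__':
    app = tornado.web.Application([(r'/topic/(.*)', TopicHandler)])
    app.listen(8888)
    tornado.ioloop.IOLoop.current().start()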
import csv
import random

from elasticsearch import Elasticsearch
# get_topics() is assumed to come from the project's own utilities.


def create_dataset():
    doc_dic = get_topics()
    data_dir = 'G:\\news_track\\'
    gains_file = data_dir + 'bqrels.exp-gains.txt'
    ids_file = data_dir + 'ids.csv'
    dst_file_path = data_dir + 'data.txt'
    es = Elasticsearch()
    # es = Elasticsearch('222.20.24.230:9200')
    ids = set()
    map_file = 'map.txt'
    with open(gains_file) as gf, open(dst_file_path, 'a', encoding='utf-8') as df, open(
            ids_file, 'r') as idf:
        gf_reader = csv.reader(gf, delimiter=" ")
        ids_reader = csv.reader(idf)
        ids = next(ids_reader)
        lines = []
        id2s = set()
        for line in gf_reader:
            id1 = doc_dic[int(line[0])]
            id2 = line[2]
            df.write(','.join([id1, id2, line[-1]]) + '\n')
            df.write(','.join([id2, id1, line[-1]]) + '\n')
            id2s.add(id2)
            # add the same amount of irrelevant data for each topic
            id3 = random.choice(ids)
            if id3 not in id2s:
                df.write(','.join([id1, id3, '-1']) + '\n')
                df.write(','.join([id3, id1, '-1']) + '\n')
        '''
        randomly pick two ids with relevance -1 and save them to ids and articles
        '''
        for i in range(10000):
            id1, id2 = random.sample(ids, 2)
            df.write(','.join([id1, id2, '-1']) + '\n')
            df.write(','.join([id2, id1, '-1']) + '\n')
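# A minimal sketch (not part of the original code) of reading the data.txt pairs written
# above back into (doc_id_1, doc_id_2, gain) tuples; only the three-column comma-separated
# format is taken from create_dataset(), everything else here is assumed.
def load_pairs(path='G:\\news_track\\data.txt'):
    pairs = []
    with open(path, encoding='utf-8') as f:
        for row in csv.reader(f):
            if len(row) == 3:
                doc_id_1, doc_id_2, gain = row
                pairs.append((doc_id_1, doc_id_2, int(gain)))
    return pairs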
# logger, model_path, config_obj, Document, nlp, queue_timeout and the helpers used
# below (basic_cleanup, get_topics, get_entities_from_nltk) are assumed to be defined
# at module level; `queue` is the standard-library module and StanfordNERTagger comes
# from nltk.
def process_article(in_queue, out_queue):
    document_id = ""
    logger.info("Processing of news feed started!!!")
    lda_model = gensim.models.ldamodel.LdaModel.load(model_path)
    stanford_tagger = StanfordNERTagger(
        config_obj.stanford_ner_classifier_path,
        config_obj.stanford_ner_jar_path)
    while True:
        try:
            document = in_queue.get(True, queue_timeout)
            document_id, document_content = document.get('id'), document.get('content')
            if document_id:
                logger.info("Processing document id - {}".format(document_id))
                node = Document.nodes.get_or_none(documentId=document_id)
                if node:
                    logger.info(
                        "Node with id - {} already exists in graph. Skipping".format(
                            document_id))
                    continue
                else:
                    cleaned_text = basic_cleanup(document_content)
                    topics = get_topics(cleaned_text, lda_model, nlp)
                    entities = get_entities_from_nltk(cleaned_text, stanford_tagger)
                    payload = {
                        "document_id": document_id,
                        "topics": topics,
                        "entities": entities
                    }
                    out_queue.put(payload, block=True)
            else:
                continue
        except queue.Empty:
            logger.warning("Queue is empty. Quitting!!!")
            break
        except Exception:
            logger.exception("Exception in processing %s, Skipping !!!!" % document_id)
    logger.info("Work done. Quitting")
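# A possible wiring sketch (assumed, not from the original module): running
# process_article as a worker process against multiprocessing queues. The feeder and
# result collection below are illustrative only.
from multiprocessing import Process, Queue
from queue import Empty


def run_pipeline(documents):
    in_queue, out_queue = Queue(), Queue()
    worker = Process(target=process_article, args=(in_queue, out_queue))
    worker.start()
    for doc in documents:  # each doc is expected to look like {"id": ..., "content": ...}
        in_queue.put(doc)
    results = []
    # drain results while the worker runs so a full output queue never blocks it
    while worker.is_alive() or not out_queue.empty():
        try:
            results.append(out_queue.get(timeout=1))
        except Empty:
            pass
    worker.join()
    return results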
def topics():
    user = request.args.get('user', '')
    topics = get_topics(user)
    return topics
from utils import (get_data, compute_all_representation, transform_for_topics,
                   transform_for_naive, transform_for_bag_of_words, transform_for_tfidf,
                   transform_for_word_embeddings, transform_for_sentence_embeddings,
                   create_labels, visualize_tsne, word_cloud)
from utils import get_topics
from utils_ML import *
from sklearn import linear_model, ensemble, svm, neural_network, naive_bayes, tree
from CNN_LSTM_models import CNN_LSTM_Wrapper
import os
import time

if __name__ == '__main__':
    data = get_data()  # The data are already shuffled
    data, word_to_index, word_to_index_we, index_we_to_emb = compute_all_representation(
        data)
    emb_matrix = [[float(x) for x in (v.split() if isinstance(v, str) else v)]
                  for _, v in sorted(index_we_to_emb.items(), key=lambda x: x[0])]
    topics, data = get_topics(data)
    '''
    word_cloud({i: x for i, x in enumerate(topics)})
    #'''
    '''
    word_to_index_we -> mapping from a lowered token to an index used to look up word embeddings
    index_we_to_emb -> mapping from a vocabulary index to word embeddings (300 dim)
    word_to_index -> same as word_to_index_we but for the naive, bag-of-words and tf-idf representations
    data -> list of samples making up the whole dataset. Each sample is a dictionary with the following attributes:
        - id: Line# from spam.csv
        - text: raw text from spam.csv
        - type: ham/spam from spam.csv
        - tokens: tokens obtained via tokenization. They are not lowered
        - lemmas: lemmas obtained via lemmatization. They are lowered
    '''
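# Illustration only (values are made up, not taken from spam.csv): based on the docstring
# above, a single entry of `data` is expected to look roughly like this.
example_sample = {
    'id': 42,                                   # Line# from spam.csv
    'text': 'Free entry in 2 a wkly comp!',     # raw text from spam.csv
    'type': 'spam',                             # ham/spam label
    'tokens': ['Free', 'entry', 'in', '2', 'a', 'wkly', 'comp', '!'],   # not lowered
    'lemmas': ['free', 'entry', 'in', '2', 'a', 'wkly', 'comp', '!'],   # lowered
}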
from elasticsearch import Elasticsearch
from RAKE.rake import Rake
from utils import get_topics

# host_ip = '222.20.25.124'
# es = Elasticsearch('222.20.25.124:9200')
es = Elasticsearch()
topics = get_topics()
rake = Rake()
index = 'news_track'


def use_Rake():
    answer_file = 'answer_rake.txt'
    # get topics as a dict
    for num, search_id in topics.items():
        # build query dsl
        dsl = {'query': {'match': {}}, 'size': 200}
        answer = {}
        # source article
        article = es.get(index=index, id=search_id)['_source']
        title = article['title']
        contents = article['contents']
        # keyword extraction using Rake
        key_words = rake.run(contents)
        # get top-10 keywords
        if len(key_words) > 10:
            key_words = key_words[:10]
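# A possible continuation (assumed, not part of the original snippet): fill the empty
# match clause with the extracted keywords and run the search. The 'contents' field name
# is taken from the article structure above; treating each key_words entry as a
# (phrase, score) pair is an assumption about this Rake implementation.
def search_with_keywords(dsl, key_words):
    phrases = [kw[0] if isinstance(kw, (list, tuple)) else kw for kw in key_words]
    dsl['query']['match'] = {'contents': ' '.join(phrases)}
    hits = es.search(index=index, body=dsl)['hits']['hits']
    # map document id -> retrieval score for later ranking / answer file output
    return {hit['_id']: hit['_score'] for hit in hits}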