Example 1
def get(self, topic):
    logging.info(topic)
    # Gather the topic list and the data for the requested topic,
    # then render the template with them.
    topics = utils.get_topics()
    data = utils.get_data_dict(topic)
    print(data)
    params = {
        "topic": topic,
        "topics": topics,
        "data": data,
    }
    self.render('topic.html', **params)
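This handler leans on a `utils` module that the listing does not show. A minimal sketch of what `get_topics()` and `get_data_dict(topic)` might look like, assuming JSON files on disk (the file names and layout are assumptions, not part of the original project):

import json

def get_topics():
    # Assumed: return the list of all known topic names.
    with open('topics.json') as f:
        return json.load(f)

def get_data_dict(topic):
    # Assumed: return the display data for a single topic.
    with open('data/%s.json' % topic) as f:
        return json.load(f)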
Example 2
def create_dataset():
    doc_dic = get_topics()
    data_dir = 'G:\\news_track\\'
    gains_file = data_dir + 'bqrels.exp-gains.txt'
    ids_file = data_dir + 'ids.csv'
    dst_file_path = data_dir + 'data.txt'
    es = Elasticsearch()
    # es = Elasticsearch('222.20.24.230:9200')

    with open(gains_file) as gf, \
            open(dst_file_path, 'a', encoding='utf-8') as df, \
            open(ids_file, 'r') as idf:
        gf_reader = csv.reader(gf, delimiter=" ")
        ids_reader = csv.reader(idf)

        # The first row of ids.csv holds every candidate document id.
        ids = next(ids_reader)
        id2s = set()
        for line in gf_reader:
            id1 = doc_dic[int(line[0])]
            id2 = line[2]
            # Write each relevant pair in both directions.
            df.write(','.join([id1, id2, line[-1]]) + '\n')
            df.write(','.join([id2, id1, line[-1]]) + '\n')
            id2s.add(id2)
            # Add the same number of irrelevant pairs for every topic.
            id3 = random.choice(ids)
            if id3 not in id2s:
                df.write(','.join([id1, id3, '-1']) + '\n')
                df.write(','.join([id3, id1, '-1']) + '\n')
        # Randomly pick two ids, mark them with relevance -1,
        # and save them to ids and articles.
        for _ in range(10000):
            id1, id2 = random.sample(ids, 2)
            df.write(','.join([id1, id2, '-1']) + '\n')
            df.write(','.join([id2, id1, '-1']) + '\n')
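The lookup `doc_dic[int(line[0])]` implies that `get_topics()` here returns a dict mapping a topic number to the id of its source document. A hedged sketch of such a loader, assuming a simple two-column `topics.txt` file (the file name and format are guesses, not from the original):

def get_topics():
    # Assumed format: one "topic_number doc_id" pair per line.
    topics = {}
    with open('topics.txt') as f:
        for line in f:
            num, doc_id = line.split()
            topics[int(num)] = doc_id
    return topics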
Example 3
def process_article(in_queue, out_queue):
    document_id = ""
    logger.info("Processing of news feed started!!!")
    # Load the LDA model and the Stanford NER tagger once, outside the loop.
    lda_model = gensim.models.ldamodel.LdaModel.load(model_path)
    stanford_tagger = StanfordNERTagger(
        config_obj.stanford_ner_classifier_path,
        config_obj.stanford_ner_jar_path)
    while True:
        try:
            document = in_queue.get(True, queue_timeout)
            document_id = document.get('id')
            document_content = document.get('content')
            if document_id:
                logger.info("Processing document id - {}".format(document_id))
                node = Document.nodes.get_or_none(documentId=document_id)
                if node:
                    logger.info(
                        "Node with id - {} already exists in graph. Skipping".
                        format(document_id))
                    continue
                else:
                    cleaned_text = basic_cleanup(document_content)
                    topics = get_topics(cleaned_text, lda_model, nlp)
                    entities = get_entities_from_nltk(cleaned_text,
                                                      stanford_tagger)

                    payload = {
                        "document_id": document_id,
                        "topics": topics,
                        "entities": entities
                    }
                    out_queue.put(payload, block=True)
            else:
                continue
        except queue.Empty as e:
            logger.warning("Queue is empty. Quitting!!!")
            break
        except Exception as e:
            logger.exception("Exception in processing %s, Skipping !!!!" %
                             document_id)
    logger.info("Work done. Quitting")
Example 4
File: views.py Project: SwoJa/ruman
def topics():
    user = request.args.get('user', '')
    topics = get_topics(user)
    return topics
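`request.args` marks this as a Flask view, though the route decorator is not shown in the listing. A minimal sketch of how it might be wired up (the `/topics` route, the app object, and the jsonify call are assumptions):

from flask import Flask, request, jsonify

from utils import get_topics  # assumed import path

app = Flask(__name__)

@app.route('/topics')
def topics():
    user = request.args.get('user', '')
    # jsonify in case get_topics returns a list or a dict.
    return jsonify(get_topics(user))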
Example 5
from utils import get_data, compute_all_representation, transform_for_topics, transform_for_naive, transform_for_bag_of_words, transform_for_tfidf, transform_for_word_embeddings, transform_for_sentence_embeddings, create_labels, visualize_tsne, word_cloud
from utils import get_topics
from utils_ML import *
from sklearn import linear_model, ensemble, svm, neural_network, naive_bayes, tree
from CNN_LSTM_models import CNN_LSTM_Wrapper
import os
import time

if __name__ == '__main__':
    data = get_data()  # The data are already shuffled
    data, word_to_index, word_to_index_we, index_we_to_emb = compute_all_representation(
        data)
    # Build the embedding matrix ordered by vocabulary index; an embedding may
    # be stored either as a whitespace-separated string or as a list of floats.
    emb_matrix = [[
        float(x) for x in (v.split() if isinstance(v, str) else v)
    ] for _, v in sorted(index_we_to_emb.items(), key=lambda x: x[0])]
    topics, data = get_topics(data)
    # word_cloud({i: x for i, x in enumerate(topics)})
    '''
    word_to_index_we -> mapping from a lowercased token to a vocabulary index
        used for word embeddings
    index_we_to_emb -> mapping from a vocabulary index to its word embedding
        (300 dims)
    word_to_index -> same as word_to_index_we, but for the naive, bag-of-words
        and tf-idf representations

    data -> list of samples making up the whole dataset.
    Each sample is a dictionary with the following attributes:
        - id: Line# from spam.csv
        - text: raw text from spam.csv
        - type: ham/spam from spam.csv
        - tokens: tokens obtained via tokenization. They are not lowercased
        - lemmas: lemmas obtained via lemmatization. They are lowercased
Example 6
from elasticsearch import Elasticsearch
from RAKE.rake import Rake

from utils import get_topics

# host_ip = '222.20.25.124'
# es = Elasticsearch('222.20.25.124:9200')
es = Elasticsearch()

topics = get_topics()
rake = Rake()
index = 'news_track'


def use_Rake():
    answer_file = 'answer_rake.txt'
    # topics is a dict mapping topic number -> source document id
    for num, search_id in topics.items():
        # build the query DSL
        dsl = {'query': {'match': {}}, 'size': 200}
        answer = {}
        # fetch the source article
        article = es.get(index=index, id=search_id)['_source']
        title = article['title']
        contents = article['contents']

        # keyword extraction using RAKE
        key_words = rake.run(contents)
        # keep only the top-10 keywords
        if len(key_words) > 10:
            key_words = key_words[:10]
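Common RAKE ports return `run()` results as (phrase, score) pairs sorted by decreasing score, which is why slicing the list keeps the ten best keywords. A small standalone sketch of that usage (the pair ordering is an assumption about this particular Rake implementation):

from RAKE.rake import Rake

rake = Rake()
scored = rake.run("Compatibility of systems of linear constraints "
                  "over the set of natural numbers.")
# Assumed: each entry is a (phrase, score) pair, best first.
for phrase, score in scored[:10]:
    print(phrase, score)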