def get_conn():
    # Build a pymysql connection from the values stored in the config module.
    host = config.get_config('host')
    user = config.get_config('user')
    password = config.get_config('password')
    database = config.get_config('database')
    return pymysql.connect(host=host, user=user, password=password,
                           database=database, charset='utf8mb4')
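# A minimal usage sketch for get_conn(), not part of the original module: the
# function name, table, and query below are hypothetical and only illustrate
# running one query and closing the connection afterwards.
def fetch_sample_rows():
    conn = get_conn()
    try:
        with conn.cursor() as cursor:
            cursor.execute("SELECT comment FROM comments LIMIT 10")
            return cursor.fetchall()
    finally:
        conn.close()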
def load_stop_words():
    # Read the stop-word file, one word per line, stripping trailing whitespace.
    path = config.get_config('stop_word_path')
    with open(path, encoding='utf-8') as f:
        stop_words = [line.strip() for line in f]
    return stop_words
def load_dataset(text_field, label_field, args, **kwargs):
    # Load the train/dev datasets, build the vocabularies (optionally with
    # pretrained vectors), and wrap them in torchtext iterators.
    train_dataset, dev_dataset = dataset.get_dataset(
        '/home/ubuntu/user_space/lhw/public_opinion_monitoring/app/datas',
        text_field, label_field)
    vec_name = config.get_config('pretrained-name')
    vec_path = config.get_config('pretrained-path')
    if args.static and args.pretrained_name and args.pretrained_path:
        print('load word vector')
        vectors = load_word_vectors(vec_name, vec_path)
        text_field.build_vocab(train_dataset, dev_dataset, vectors=vectors)
    else:
        text_field.build_vocab(train_dataset, dev_dataset)
    label_field.build_vocab(train_dataset, dev_dataset)
    train_iter, dev_iter = data.Iterator.splits(
        (train_dataset, dev_dataset),
        batch_sizes=(args.batch_size, len(dev_dataset)),
        sort_key=lambda x: len(x.comment),
        **kwargs)
    return train_iter, dev_iter
def load_test_data():
    # Sample 10 question/answer pairs from the test set for a quick check.
    path = config.get_config('test_data_path')
    data = pd.read_csv(path, encoding='utf-8')
    sample = data.sample(10).reset_index()
    # print(sample)
    questions = sample.get('question')
    answer = sample.get('answer')
    return questions, answer
def split_dataset():
    # Shuffle the full dataset and split it roughly 80/20 into train and test CSVs.
    path = config.get_config('data_path')
    data = pd.read_csv(path, encoding='utf-8')
    shuffle_data = data.sample(frac=1.0).reset_index()
    train_num = int(shuffle_data.shape[0] * 0.8)
    train_data = shuffle_data.loc[0:train_num].drop(labels='index', axis=1)
    test_data = shuffle_data.loc[train_num + 1:].drop(labels='index', axis=1)
    train_data.to_csv("../datas/train_data.csv", index=False)
    test_data.to_csv("../datas/test_data.csv", index=False)
    return
def load_data():
    # Build the word index over the training questions; rows that fail to parse
    # are printed and skipped.
    path = config.get_config('train_data_path')
    data = pd.read_csv(path, encoding='utf-8')
    questions = data.get('question')
    answer = data.get('answer')
    stop_words = load_stop_words()
    word_dict = defaultdict()
    for each in range(len(questions)):
        try:
            generate_index_dict(questions[each], each, stop_words, word_dict)
        except Exception:
            print(each)
            print(questions[each])
    return word_dict, stop_words, answer, questions
def handle_corpus():
    # Tokenise each question/answer pair with jieba and join the tokens with
    # spaces, producing one corpus sentence per pair.
    path = config.get_config('train_data_path')
    data = pd.read_csv(path, encoding='utf-8')
    questions = data.get('question')
    answer = data.get('answer')
    corpus = []
    for each in range(len(questions)):
        try:
            words = jieba.lcut(questions[each] + ',' + answer[each])
            # temp = []
            # for word in words:
            #     w = re.match('[\u4e00-\u9fa5]', word, False)
            #     if w is not None:
            #         temp.append(w.string)
            # if len(temp) <= 0:
            #     continue
            sen = reduce(lambda x, y: x + ' ' + y, words)
            corpus.append(sen)
        except Exception:
            print(each)
            print(questions[each])
    return corpus
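# A minimal sketch, not part of the original module, of how the corpus from
# handle_corpus() could feed the Word2Vec model that load_word_embedding_model()
# later loads. The function name is hypothetical; it assumes the
# 'word_embedding_path' config key points at a writable location.
def train_word_embedding_model():
    corpus = handle_corpus()
    # handle_corpus() returns space-joined sentences; Word2Vec expects token lists.
    tokenized = [sentence.split(' ') for sentence in corpus]
    model = gensim.models.Word2Vec(tokenized, min_count=1)
    model.save(config.get_config('word_embedding_path'))
    return model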
def load_word_embedding_model(path=None):
    # Fall back to the configured path only when no explicit path is given,
    # instead of unconditionally overwriting the argument.
    if path is None:
        path = config.get_config('word_embedding_path')
    word_embedding = gensim.models.Word2Vec.load(path)
    # word_embedding = KeyedVectors.load_word2vec_format(path)
    return word_embedding
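# Usage sketch (not in the original): looking up a single word vector from the
# loaded model; the word below is only an illustration.
# model = load_word_embedding_model()
# vector = model.wv['新闻']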
def schedule_task():
    # Kick off the crawl loop on a background thread: the first run fires after
    # 10 seconds, then craw_latest_comment re-schedules itself every craw_interval seconds.
    interval = config.get_config("craw_interval")
    scheduler.enter(10, 0, craw_latest_comment, (int(interval), ))
    task = threading.Thread(target=scheduler.run)
    task.start()
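# Assumed module-level setup for the `scheduler` object used above (not shown in
# the original snippet): a standard-library sched scheduler driven by wall-clock time.
# import sched
# import time
# scheduler = sched.scheduler(time.time, time.sleep)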
general_service.plot_latest_chart()


def craw_latest_comment(inc):
    # Re-schedule the next crawl before running the current one.
    scheduler.enter(inc, 0, craw_latest_comment, (inc, ))
    do_job()


path = config.get_config('word_embedding_path')
word_vector_model = KeyedVectors.load_word2vec_format(path)


def cosine(vec1, vec2):
    # Cosine distance between two word vectors; scipy's pdist returns a one-element array.
    distance = pdist(np.vstack([vec1, vec2]), 'cosine')[0]
    return distance


def extract_key_words(comment):
    # Strip line breaks, tokenise with jieba, then drop single-character tokens
    # and stop words.
    comment = comment.replace("\n", "")
    comment = comment.replace("\r", "")
    words = jieba.lcut(comment)
    words = filter(lambda x: len(x) > 1, words)
    words = list(filter(lambda x: x not in stop_words, words))
    if len(words) <= 0: