def process_from_json(file_path, nlp_model):
    """
    Read data from a json file and extract knowledge from it
    :param file_path: json file path
    :param nlp_model:
    :return:
    """
    try:
        # mongo = dbConnector(MONGODB_SERVER, MONGODB_PORT, MONGODB_DB, MONGODB_COLLECTION)
        es = esConnector(url=ES_URL, index=ES_INDEX, doc_type=ES_DOC_TYPE)
        with open(file_path, 'rb') as f:
            string = f.read()
        record = json.loads(string)
        document_model = documentExtraction(record, nlp_model)
        if not es.check_info_exist(document_model.title):
            logger.info('begin extract doc %s...' % document_model.title)
            document_info = document_model.extract_knowledge_from_record()
            if len(document_info.keys()):
                es.insert_single_info(document_info)
            else:
                logger.warn('extract document info failed, skip es store')
        else:
            logger.info('doc %s exist in es, skip' % document_model.title)
    except Exception as e:
        logger.error('document extraction process from json file failed for %s' % str(e))
def _doc_info_analysis(self, doc_info):
    """
    Analyse doc_info and extract the document attributes
    :param doc_info:
    :return: node_info
             node_type: type of the node (notice, file)
             id: id in es
             title: file name
    """
    try:
        info = doc_info['_source']
        # type of the stored doc
        if len(info.get('parrent_file', [])):
            node_type = 'file'
        else:
            node_type = 'doc'
        node_id = doc_info['_id']
        title = info.get('title', '')
        location = info.get('publish_location', '')
        return {
            'node_name': 'notice',
            'node_type': node_type,
            'id': node_id,
            'title': title,
            'location': location
        }
    except Exception as e:
        logger.info('analysis doc info failed for %s' % str(e))
        return None
def attention_layer_op(self):
    """
    define attention layer
    :return:
    """
    with tf.name_scope('attention'), tf.variable_scope('attention'):
        attention_w = tf.Variable(tf.truncated_normal([2 * self.hidden_size, self.attention_size], stddev=0.1),
                                  name='attention_w')
        attention_b = tf.Variable(tf.constant(0.1, shape=[self.attention_size]), name='attention_b')
        u_list = []
        for t in list(range(self.sequence_length)):
            u_t = tf.tanh(tf.matmul(self.hidden_outputs[t], attention_w) + attention_b)
            u_list.append(u_t)
        u_w = tf.Variable(tf.truncated_normal([self.attention_size, 1], stddev=0.1), name='attention_uw')
        attn_z = []
        for t in list(range(self.sequence_length)):
            z_t = tf.matmul(u_list[t], u_w)
            attn_z.append(z_t)
        # transform to batch_size * sequence_length
        attn_zconcat = tf.concat(attn_z, axis=1)
        self.alpha = tf.nn.softmax(attn_zconcat)
        # transform to sequence_length * batch_size * 1, same rank as outputs
        alpha_trans = tf.reshape(tf.transpose(self.alpha, [1, 0]), [self.sequence_length, -1, 1])
        self.attention_output = tf.reduce_sum(self.hidden_outputs * alpha_trans, 0)
        logger.info('attention layer output shape is %s' % self.attention_output.shape)

    with tf.name_scope("output"):
        # outputs shape: (sequence_length, batch_size, 2*rnn_size)
        W = tf.Variable(tf.truncated_normal([2 * self.hidden_size, self.num_classes], stddev=0.1), name='W')
        b = tf.Variable(tf.zeros([self.num_classes]), name='b')
        self.l2_loss += tf.nn.l2_loss(W)
        self.l2_loss += tf.nn.l2_loss(b)
        self.logits = tf.nn.xw_plus_b(self.attention_output, W, b, name="logits")
        self.prob = tf.nn.softmax(self.logits, name='prob')
        self.predictions = tf.argmax(self.prob, 1, name="predictions")
def run(self):
    """
    """
    logger.info('begin crawler..')
    try:
        self._run()
    except Exception as e:
        logger.error('start crawler failed for %s, stop crawler' % str(e))
        sys.exit(1)
def cut_doc(self):
    """
    Tokenize the document text
    :return:
    """
    logger.info(u'document text is not segmented, segmenting with thunlp')
    # self.thunlp_model = thulac.thulac(seg_only=False, model_path=THUNLP_MODEL_PATH, \
    #                                   user_dict=THUNLP_USER_DIC_PATH)
    doc_seg = self.thunlp_model.cut(self.doc)
    # keep the raw segmentation result for combining adjacent key words into phrases
    self.origin_doc_seg = doc_seg
    doc_seg_clear = self._clear_seg_list(doc_seg)
    return doc_seg_clear
def lstm():
    # define forward cell
    with tf.name_scope('fw_rnn'), tf.variable_scope('fw_rnn'):
        logger.info(tf.get_variable_scope().name)
        lstm_fw_cell_list = [tf.contrib.rnn.LSTMCell(self.hidden_size) for _ in list(range(self.num_layer))]
        lstm_fw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(lstm_fw_cell_list),
                                                       output_keep_prob=self.dropout_keep_prob)
    # define backward cell
    with tf.name_scope('bw_rnn'), tf.variable_scope('bw_rnn'):
        logger.info(tf.get_variable_scope().name)
        lstm_bw_cell_list = [tf.contrib.rnn.LSTMCell(self.hidden_size) for _ in list(range(self.num_layer))]
        lstm_bw_cell_m = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(lstm_bw_cell_list),
                                                       output_keep_prob=self.dropout_keep_prob)
    return lstm_fw_cell_m, lstm_bw_cell_m
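# A minimal, self-contained sketch (an assumption, not part of the original model class)
# of how forward/backward cells like those built by lstm() could be wired into a
# bidirectional RNN whose output matches the [sequence_length, batch_size, 2 * hidden_size]
# shape that attention_layer_op above consumes. Function and argument names are illustrative.
import tensorflow as tf

def build_bi_rnn_sketch(embedded_inputs, hidden_size, num_layer, dropout_keep_prob):
    """embedded_inputs: float tensor of shape [batch_size, sequence_length, embedding_dim]."""
    def make_cell():
        cells = [tf.contrib.rnn.LSTMCell(hidden_size) for _ in range(num_layer)]
        return tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.MultiRNNCell(cells),
                                             output_keep_prob=dropout_keep_prob)

    (out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
        make_cell(), make_cell(), embedded_inputs, dtype=tf.float32)
    # concatenate the two directions -> [batch, seq_len, 2*hidden], then make it time-major
    outputs = tf.concat([out_fw, out_bw], axis=2)
    return tf.transpose(outputs, [1, 0, 2])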
def _add_summary(self, sess, vocab_processor):
    """
    TensorBoard visualization
    :param sess:
    :return:
    """
    # Keep track of gradient values and sparsity (optional)
    grad_summaries = []
    for g, v in self.grads_and_vars:
        if g is not None:
            grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
            sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
            grad_summaries.append(grad_hist_summary)
            grad_summaries.append(sparsity_summary)
    grad_summaries_merged = tf.summary.merge(grad_summaries)

    # Output directory for models and summaries
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    logger.info("Writing to {}\n".format(out_dir))

    # Summaries for loss and accuracy
    loss_summary = tf.summary.scalar("loss", self.loss)
    acc_summary = tf.summary.scalar("accuracy", self.accuracy)

    # Train Summaries
    self.train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")
    self.train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

    # Dev summaries
    self.dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
    dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
    self.dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

    # Checkpoint directory. TensorFlow assumes this directory already exists, so we need to create it
    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    self.checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # Write vocabulary
    vocab_processor.save(os.path.join(out_dir, "vocab"))
def train_step(self, sess, x_batch, y_batch):
    """
    A single training step
    """
    feed_dict = {
        self.input_x: x_batch,
        self.input_y: y_batch,
        self.dropout_keep_prob: self.dropout
    }
    # scores, predictions = sess.run([self.scores, self.predictions], feed_dict)
    _, step, summaries, loss, accuracy = sess.run(
        [self.train_op, self.global_step, self.train_summary_op, self.loss, self.accuracy], feed_dict)
    time_str = datetime.datetime.now().isoformat()
    logger.info("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
    self.train_summary_writer.add_summary(summaries, step)
def dev_step(self, sess, x_batch, y_batch):
    """
    Evaluates model on a dev set
    """
    feed_dict = {
        self.input_x: x_batch,
        self.input_y: y_batch,
        self.dropout_keep_prob: 1.0
    }
    step, summaries, loss, accuracy = sess.run(
        [self.global_step, self.dev_summary_op, self.loss, self.accuracy], feed_dict)
    time_str = datetime.datetime.now().isoformat()
    logger.info("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
    self.dev_summary_writer.add_summary(summaries, step)
def save_attachement_file(self, attachment_file_link, attachment_file_name):
    """
    Save an attachment file
    :param attachment_file_link:
    :return:
    """
    try:
        response = self.get(attachment_file_link)
        with open(os.path.join(SAVING_PATH, attachment_file_name), 'wb') as f:
            logger.info('saving file %s' % attachment_file_name)
            f.write(response)
    except Exception as e:
        logger.error('saving attachment file failed for %s' % str(e))
def preprocess_for_data():
    """
    Pre-processing pipeline that converts text into model inputs
    :return:
    """
    try:
        train_sentences = load_sentence_file(FLAGS.train_file, FLAGS.zeros)
        dev_sentences = load_sentence_file(FLAGS.dev_file, FLAGS.zeros)
        test_sentences = load_sentence_file(FLAGS.test_file, FLAGS.zeros)
        # change tag schema in sentences
        trans_tag_schema(train_sentences, FLAGS.tag_schema)
        trans_tag_schema(test_sentences, FLAGS.tag_schema)
        # loading/writing mapping file
        if not os.path.isfile(FLAGS.map_file):
            logger.info('mapping file does not exist, create mapping file')
            if FLAGS.pre_emb:
                pass
            else:
                char_count_dic, id_to_char, char_to_id = char_mapping(train_sentences, FLAGS.lower)
            tag_count_dic, id_to_tag, tag_to_id = tag_mapping(train_sentences)
            with open(FLAGS.map_file, 'wb') as f:
                # notice pickle file format with py2 and py3
                pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
        else:
            pass
        logger.info('loading mapping file')
        with open(FLAGS.map_file, 'rb') as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        # prepare model data set
        # format data --- [[char_list, char_id_list, seg_id_list, tags_id_list], []]
        # seg_id_list example: [X/XX/XXX/XXXX] -> [0 /1 3 /1 2 3 /1 2 2 3]
        train_data = prepare_model_data(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
        dev_data = prepare_model_data(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
        test_data = prepare_model_data(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
        train_manager = BatchManager(train_data, FLAGS.batch_size)
        dev_manager = BatchManager(dev_data, 100)
        test_manager = BatchManager(test_data, 100)
        return train_manager, dev_manager, test_manager
    except Exception as e:
        logger.error('pre-processing for training data failed for %s' % str(e))
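# A minimal sketch (an assumption, not the repo's own trans_tag_schema) of the kind of
# tag-schema conversion referenced above: rewriting IOB tags to IOBES, where single-token
# entities become S-* and entity-final tokens become E-*. The function name is illustrative.
def iob_to_iobes_sketch(tags):
    """tags: list of IOB tags for one sentence, e.g. ['B-LOC', 'I-LOC', 'O']."""
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
            continue
        prefix, label = tag.split('-', 1)
        next_tag = tags[i + 1] if i + 1 < len(tags) else 'O'
        continues = next_tag == 'I-' + label
        if prefix == 'B':
            new_tags.append(('B-' if continues else 'S-') + label)
        else:  # 'I'
            new_tags.append(('I-' if continues else 'E-') + label)
    return new_tags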
def save_notice_info(self, notice_info):
    """
    :param notice_info:
    :return:
    """
    try:
        if not self._check_info_exist(notice_info['noticeTitle']):
            logger.info('insert notice info...')
            self.mongo.collection.insert_one(notice_info)
        else:
            logger.info('update notice info...')
            self.mongo.collection.find_one_and_update(
                {'noticeTitle': notice_info['noticeTitle']},
                {'$set': notice_info})
    except Exception as e:
        logger.error('mongoDB store notice info failed for %s' % str(e))
def _create_doc_node(self, result_info):
    """
    Create document nodes in the graph
    :param result_info: query results from es
    :return:
    """
    try:
        for doc_info in result_info:
            doc_analysis = self._doc_info_analysis(doc_info)
            if doc_analysis:
                if not self.neo4j_db.check_node_exist(doc_analysis):
                    self.neo4j_db.create_doc_node(doc_analysis)
                    logger.info('create node...')
                else:
                    logger.info('node is existed, skip')
            else:
                logger.warn('analysis doc info failed, skip...')
    except Exception as e:
        logger.error('create doc node failed for %s' % str(e))
def _create_node_relationship(self, result_info, rule_list):
    """
    Create relationships between nodes according to the rules
    :param result_info:
    :return:
    """
    try:
        for source_info in result_info:
            # begin match rules
            logger.info('extract file with id %s' % str(source_info.get('_id', '')))
            for rule in rule_list:
                is_match, relationship_type, relationship_info = rule(source_info)
                if is_match:
                    logger.info('matching rule %s' % rule.__name__)
                    self.neo4j_db.create_relation(relationship_type, relationship_info)
                else:
                    pass
    except Exception as e:
        logger.error('extract relationship between nodes failed for %s' % str(e))
def __get_content_title(self):
    """
    Fetch content and title in one place, since they are used multiple times
    :return:
    """
    try:
        if not self.file_name:
            self.title = self.record.get('noticeTitle', '')
            self.content = self.record.get('noticeContent', '')
            self.type = 'notice'
        else:
            # remove spaces from the file name; spaces were stripped during file format conversion
            self.file_name = self.__pre_deal_with_str(self.file_name)
            if len(self.file_name.split('.')) >= 2:
                self.title = self.file_name.split('.')[-2]
            else:
                self.title = self.file_name
            file_type = self.file_name.split('.')[-1]
            if file_type in ['xls', 'xlsx']:
                trans_file_type = 'csv'
            else:
                trans_file_type = 'txt'
            trans_file_name = self.file_name[:-1 * (len(file_type) + 1)] + '.' + trans_file_type
            if os.path.isfile(os.path.join(FILE_PATH, trans_file_name)):
                logger.info('reading file %s' % trans_file_name)
                with open(os.path.join(FILE_PATH, trans_file_name), 'r') as f:
                    self.content = f.read()
                self.type = file_type
            else:
                logger.warn('file %s does not have a trans file' % trans_file_name)
                self.content = ''
                self.type = ''
    except Exception as e:
        logger.error('get content and title string failed for %s' % str(e))
        self.title = ''
        self.content = ''
        self.type = ''
def _save_data(self, info):
    """
    Save data info
    :param info:
    :return:
    """
    try:
        if not self._check_info_exist(info['id'], info['year'], info['location']):
            logger.info('insert notice info...')
            self.mongo.collection.insert_one(info)
        else:
            logger.info('update notice info...')
            self.mongo.collection.find_one_and_update(
                {
                    'id': info['id'],
                    'year': info['year'],
                    'location': info['location']
                },
                {'$set': info})
    except Exception as e:
        logger.error('mongoDB save data info failed for %s' % str(e))
def train(self, vocab_processor, x_train, y_train, x_dev, y_dev, pre_embeddings=None, checkpoint_file=None):
    """
    model train process
    :return:
    """
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=self.num_checkpoints)
    # GPU assign
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = False
    with tf.Session(config=tf_config) as sess:
        sess.run(self.init_op)
        self._add_summary(sess, vocab_processor)
        # using pre trained embeddings
        if IS_PRETRAINED_EMBEDDING:
            sess.run(self._word_embeddings.assign(pre_embeddings))
            del pre_embeddings
        # restore model
        if IS_MIDDLE_MODEL:
            saver.restore(sess, checkpoint_file.model_checkpoint_path)
        # Generate batches
        batches = batch_iter(list(zip(x_train, y_train)), self.batch_size, self.num_epochs)
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            self.train_step(sess, x_batch, y_batch)
            current_step = tf.train.global_step(sess, self.global_step)
            if current_step % self.evaluate_every == 0:
                logger.info("Evaluation:")
                self.dev_step(sess, x_dev, y_dev)
                logger.info("")
            if current_step % self.checkpoint_every == 0:
                path = saver.save(sess, self.checkpoint_prefix, global_step=current_step)
                logger.info("Saved model checkpoint to {}\n".format(path))
def train(self, vocab_processor, x_train, y_train, x_dev, y_dev):
    """
    model train process
    :return:
    """
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=self.num_checkpoints)
    with tf.Session() as sess:
        sess.run(self.init_op)
        self._add_summary(sess, vocab_processor)
        # Generate batches
        batches = batch_iter(list(zip(x_train, y_train)), self.batch_size, self.num_epochs)
        # Training loop. For each batch...
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            self.train_step(sess, x_batch, y_batch)
            current_step = tf.train.global_step(sess, self.global_step)
            if current_step % self.evaluate_every == 0:
                logger.info("Evaluation:")
                self.dev_step(sess, x_dev, y_dev)
                logger.info("")
            if current_step % self.checkpoint_every == 0:
                path = saver.save(sess, self.checkpoint_prefix, global_step=current_step)
                logger.info("Saved model checkpoint to {}\n".format(path))
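# batch_iter is used by the training loops above but is not shown in this section.
# A minimal sketch under the assumption that it is the usual shuffled mini-batch
# generator found in TextCNN-style training code; the name and behaviour here are assumptions.
import numpy as np

def batch_iter_sketch(data, batch_size, num_epochs, shuffle=True):
    """Yield mini-batches of `data` (a list of (x, y) pairs) for `num_epochs` epochs."""
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for _ in range(num_epochs):
        if shuffle:
            shuffled = data[np.random.permutation(np.arange(data_size))]
        else:
            shuffled = data
        for batch_num in range(num_batches_per_epoch):
            start = batch_num * batch_size
            end = min(start + batch_size, data_size)
            yield shuffled[start:end]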
def run(self):
    """
    Main function
    :return:
    """
    try:
        wds_info = [{"wdcode": "reg", "valuecode": ""}]
        dfwds_info = [{
            "wdcode": "zb",
            "valuecode": ""
        }, {
            "wdcode": "sj",
            "valuecode": "LAST20"
        }]
        for needed_key in self.needed_info:
            dfwds_info[0]['valuecode'] = needed_key['id']
            zb_key = needed_key['name']
            needed_key_info_list = list()
            for _reg in self.reg_info:
                wds_info[0]['valuecode'] = _reg['code']
                reg_key = _reg['name']
                logger.info('analysis %s data info' % reg_key)
                self.post_data['wds'] = json.dumps(wds_info)
                self.post_data['dfwds'] = json.dumps(dfwds_info)
                response = requests.post(self.data_url, data=self.post_data)
                result = json.loads(response.content)
                data_list = self._analysis_table_data(result, zb_key, reg_key)
                data_reg_info = {
                    'location': _reg['name'],
                    'key': zb_key,
                    'data': data_list
                }
                needed_key_info_list.append(data_reg_info)
            self._save_json(needed_key_info_list, '../data/%s.json' % zb_key)
    except Exception as e:
        logger.error('crawler main process failed for %s' % str(e))
def _create_entity_node(self, result_info):
    """
    Create entity nodes in the graph
    :param result_info:
    :return:
    """
    try:
        entity_cache_list = list()
        for doc_info in result_info:
            info = doc_info['_source']
            entity_name = info.get('entity_name', [])
            entity_org = info.get('entity_org', [])
            entity_loc = info.get('entity_loc', [])
            for seg in entity_name:
                if seg not in entity_cache_list:
                    entity_info = {
                        'entity_type': 'name',
                        'seg': seg
                    }
                    self.neo4j_db.create_entity_node(entity_info)
                    logger.info('create name entity node of %s' % seg)
                    entity_cache_list.append(seg)
                else:
                    continue
            for seg in entity_org:
                if seg not in entity_cache_list:
                    entity_info = {
                        'entity_type': 'org',
                        'seg': seg
                    }
                    self.neo4j_db.create_entity_node(entity_info)
                    logger.info('create organization entity node of %s' % seg)
                    entity_cache_list.append(seg)
                else:
                    continue
            for seg in entity_loc:
                if seg not in entity_cache_list:
                    entity_info = {
                        'entity_type': 'loc',
                        'seg': seg
                    }
                    self.neo4j_db.create_entity_node(entity_info)
                    logger.info('create location entity node of %s' % seg)
                    entity_cache_list.append(seg)
                else:
                    continue
    except Exception as e:
        logger.error('create entity node failed for %s' % str(e))
def train():
    """
    model train process
    :return:
    """
    logger.info('Loading train data...')
    # Load train data
    x_text, y = load_data_and_labels(TRAIN_DATA_PATH_POS, TRAIN_DATA_PATH_NEG)
    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    vocab_size = len(vocab_processor.vocabulary_)
    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    del x, y, x_shuffled, y_shuffled
    logger.info("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    logger.info("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    model = TextCNN(sequence_length=x_train.shape[1],
                    num_classes=y_train.shape[1],
                    dropout=dropout,
                    vocab_size=vocab_size,
                    embedding_dim=embedding_dim,
                    filter_sizes=filter_sizes,
                    num_filters=num_filters,
                    l2_reg_lambda=l2_reg_lambda,
                    optimizer=optimizer,
                    lr=lr,
                    grad_clip=grad_clip,
                    num_checkpoints=num_checkpoints,
                    batch_size=batch_size,
                    num_epochs=num_epochs,
                    evaluate_every=evaluate_every,
                    checkpoint_every=checkpoint_every)
    model.build_graph()
    model.train(vocab_processor, x_train, y_train, x_dev, y_dev)
def train():
    """
    Training module
    :return:
    """
    try:
        # limit GPU memory
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        train_manager, dev_manager, test_manager = preprocess_for_data()
        logger.info('loading mapping file')
        with open(FLAGS.map_file, 'rb') as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        make_path(FLAGS)
        if os.path.isfile(FLAGS.config_file):
            config = load_config(FLAGS.config_file)
        else:
            config = build_config(char_to_id, tag_to_id)
            save_config(config, FLAGS.config_file)
        steps_per_epoch = train_manager.len_data
        with tf.Session(config=tf_config) as sess:
            model = initial_ner_model(sess, NER_MODEL, FLAGS.ckpt_path, load_word2vec, config, id_to_char)
            logger.info("start training NER model")
            loss = []
            # epoch iterate
            for i in range(FLAGS.max_epoch):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{}, "
                                    "NER loss:{:>9.6f}".format(iteration, step % steps_per_epoch,
                                                               steps_per_epoch, np.mean(loss)))
                        loss = []
                save_model(sess, model, FLAGS.ckpt_path)
                # evaluate result for stop epoch iter
                # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
                # if best:
                #     save_model(sess, model, FLAGS.ckpt_path, logger)
                # evaluate(sess, model, "test", test_manager, id_to_tag, logger)
    except Exception as e:
        logger.error('training model process failed for %s' % str(e))
def search_link_info(self, notice_link):
    """
    Get the full notice text via the notice link and download the attachments
    :param notice_link:
    :return:
    """
    try:
        if notice_link.startswith('http'):
            pass
        else:
            notice_link = self.title_base_url + notice_link[1:]
        # generate the base url for attachment file links
        notice_baseurl = notice_link[0:(len(notice_link.split('/')[-1]) + 1) * -1]
        response = self.get(notice_link)
        notice_soup = BeautifulSoup(response, 'html5lib')
        title_tag = notice_soup.find('td', attrs={'class': 'font_biao1'})
        main_tag = notice_soup.find('div', attrs={'class': 'TRS_Editor'})
        attachment_tag = notice_soup.find('span', attrs={'id': 'appendix'})
        title = self._get_tag_string(title_tag).strip()
        # debug 2018-9-12
        # file name without space
        title = title.replace(' ', '')
        # if self._check_info_exist(title):
        #     return None, True
        logger.info('notice title is %s' % title)
        # notice doc search
        doc_tag_list = main_tag.find_all('p')
        doc_content = ''
        doc_identify = ''
        doc_attachment = ''
        # the source page uses p tags for line breaks,
        # so a newline is appended to the content for each p tag
        # 2018-9-4 cc
        for doc_tag in doc_tag_list:
            if doc_tag.attrs.get('align') == 'center':
                doc_content += self._get_tag_string(doc_tag) + '\n'
                doc_identify += self._get_tag_string(doc_tag).strip()
            # elif doc_tag.attrs.get('align') == 'justify':
            #     doc_content += self._get_tag_string(doc_tag)
            elif doc_tag.attrs.get('align') == 'right':
                doc_content += self._get_tag_string(doc_tag) + '\n'
                doc_attachment += self._get_tag_string(doc_tag).strip() + '\n'
            else:
                doc_content += self._get_tag_string(doc_tag) + '\n'
        # attachment file search
        attachment_file_list = attachment_tag.find_all('a')
        attachment_file_name_list = list()
        attachment_file_link_list = list()
        # some attachment names do not contain the file extension,
        # so the extension has to be taken from the link
        # 2018-9-6 debug
        for attachment_file_tag in attachment_file_list:
            attachment_file_name = ''
            _attachment_link = attachment_file_tag.attrs.get('href')
            try:
                file_type = _attachment_link.split('.')[-1]
            except:
                logger.warn('search file type failed')
                file_type = ''
            _attachment_file_name = self._get_tag_string(attachment_file_tag).strip()
            if ':' in _attachment_file_name:
                attachment_file_name = _attachment_file_name.split(':')[-1]
            elif '：' in _attachment_file_name:
                attachment_file_name = _attachment_file_name.split('：')[-1]
            else:
                attachment_file_name = _attachment_file_name
            # add file attachment type
            try:
                attachment_file_type = attachment_file_name.split('.')[-1]
            except:
                attachment_file_type = ''
            if attachment_file_type not in ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip'] \
                    and file_type != '':
                attachment_file_name = attachment_file_name + '.' + file_type
            # _attachment_link format './P020180828399303596996.pdf'
            attachment_file_link = notice_baseurl + _attachment_link[1:]
            # saving file
            self.save_attachement_file(attachment_file_link, attachment_file_name)
            attachment_file_name_list.append(attachment_file_name)
            attachment_file_link_list.append(attachment_file_link)
        return {
            'noticeTitle': title,
            'noticeContent': doc_content,
            'noticeIdentify': doc_identify,
            'noticeAttachment': doc_attachment,
            'noticeLink': notice_link,
            'attachmentFileList': attachment_file_name_list,
            'attachmentLinkList': attachment_file_link_list,
            'category': self.category,
            'filePath': SAVING_PATH,
            'location': self.location
        }, False
    except Exception as e:
        logger.error('searching link info failed for %s' % str(e))
        return None, False
def trans_file_from_db(trans_path):
    """
    :return:
    """
    try:
        mongo_db = dbConnector(MONGODB_SERVER, MONGODB_PORT, MONGODB_DB, MONGODB_COLLECTION)
        # change into the file storage directory
        # os.system('cd %s' % SAVING_PATH)
        path_command = 'cd %s &&' % SAVING_PATH
        failed_list = list()
        for record in mongo_db.collection.find():
            file_list = record.get('attachmentFileList', [])
            for file_name in file_list:
                logger.info('begin to trans file %s' % file_name)
                # file names containing spaces break the shell command
                # reminder: keep this consistent with the attachment file list stored in mongoDB
                if ' ' in file_name:
                    logger.info('file name has space string, trans file name')
                    os.system(path_command + "mv '%s' %s" % (file_name, file_name.replace(' ', '')))
                    file_name = file_name.replace(' ', '')
                base_name = file_name[:(len(file_name.split('.')[-1]) + 1) * -1]
                if file_name.endswith('.doc') or file_name.endswith('.docx'):
                    os.system(path_command + 'unoconv -f txt %s' % file_name)
                    os.system(path_command + 'mv %s.txt %s' % (base_name, trans_path))
                elif file_name.endswith('.xls') or file_name.endswith('.xlsx'):
                    os.system(path_command + 'unoconv -f csv %s' % file_name)
                    os.system(path_command + 'mv %s.csv %s' % (base_name, trans_path))
                elif file_name.endswith('.pdf'):
                    os.system(path_command + 'pdftotext -nopgbrk %s %s/%s.txt' % (file_name, trans_path, base_name))
                # archive file types are not fully handled
                # currently rar, zip and gz are skipped
                elif file_name.endswith('.rar') or file_name.endswith('.zip') or file_name.endswith('.gz'):
                    pass
                else:
                    logger.warn('file type is not recognized; file name is %s' % file_name)
                    # trying to trans as a doc/docx file
                    logger.info('trying trans file with unoconv txt')
                    result = os.system(path_command + 'unoconv -f txt %s' % file_name)
                    if not result:
                        os.system(path_command + 'mv %s.txt %s' % (base_name, trans_path))
                        continue
                    else:
                        logger.warn('trans file with unoconv txt failed')
                    # trying to trans as an xls/xlsx file
                    logger.info('trying trans file with unoconv csv')
                    result = os.system(path_command + 'unoconv -f csv %s' % file_name)
                    if not result:
                        os.system(path_command + 'mv %s.csv %s' % (base_name, trans_path))
                        continue
                    else:
                        logger.warn('trans file with unoconv csv failed')
                    # trying to trans as a pdf file
                    logger.info('trying trans file with pdftotext')
                    result = os.system(path_command + 'pdftotext -nopgbrk %s %s/%s.txt' % (file_name, trans_path, base_name))
                    if not result:
                        continue
                    else:
                        logger.warn('trans file with pdftotext failed')
                    failed_list.append(file_name)
        # print the names of files that could not be converted
        for file_name in failed_list:
            print(file_name)
    except Exception as e:
        logger.error('file trans failed for %s' % str(e))
# coding=utf-8
"""
@ license: Apache Licence
@ github: invoker4zoo
@ author: invoker/cc
@ wechart: whatshowlove
@ software: PyCharm
@ file: web_server.py
@ time: 18-9-25 6:25 PM
"""
import tornado.ioloop
import tornado.web
from handler import *
from tool.logger import logger

HandlerList = [
    (r"/main", MainSearchHandler),
    (r"/search/query", QuerySearchHandler),
    (r"/search/id", IdSearchHandler),
]

if __name__ == '__main__':
    application = tornado.web.Application(HandlerList)
    serverPort = 8080
    application.listen(serverPort)
    logger.info('server start at port %d' % serverPort)
    tornado.ioloop.IOLoop.instance().start()
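# A minimal sketch (an assumption, not the actual handler module) of the shape of a
# handler registered in HandlerList above; the real MainSearchHandler, QuerySearchHandler
# and IdSearchHandler live in handler.py and are not shown in this section.
import tornado.web

class MainSearchHandlerSketch(tornado.web.RequestHandler):
    def get(self):
        # echo the query argument back as JSON; the real handler presumably queries the ES index instead
        query = self.get_argument('query', default='')
        self.write({'query': query, 'result': []})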
# coding=utf-8
"""
@ license: Apache Licence
@ github: invoker4zoo
@ author: invoker/cc
@ wechart: whatshowlove
@ software: PyCharm
@ file: process.py
@ time: 18-9-25 4:34 PM
"""
import sys
sys.path.append('..')
from tool.logger import logger
from config.config import *
import thulac
from document_extraction import main_process
from knowledge_extraction_sample import buildGraph

if __name__ == '__main__':
    logger.info('loading nlp model')
    # thunlp_model = thulac.thulac(seg_only=False, model_path=THUNLP_MODEL_PATH, \
    #                              user_dict=THUNLP_USER_DIC_PATH)
    # logger.info('begin document extraction...')
    # main_process(thunlp_model)
    logger.info('begin knowledge extraction...')
    process = buildGraph()
    process.initial()
def test():
    """
    model test process
    :return:
    """
    logger.info('Loading test data...')
    # Load test data
    x_text, y = load_data_and_labels(TEST_DATA_PATH_POS, TEST_DATA_PATH_NEG)
    # Load vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore('../process/runs/1548669564/vocab')
    x_test = np.array(list(vocab_processor.transform(x_text)))
    y_test = y
    logger.info('test data: {}'.format(len(x_test)))
    # Load trained model
    # ckpt_file = tf.train.latest_checkpoint('..\\process\\runs\\1548399694\\checkpoints\\')
    # logger.info('model path is %s' % ckpt_file)
    # testing
    graph = tf.Graph()
    with graph.as_default():
        tf_config = tf.ConfigProto()
        # Misc Parameters
        tf_config.allow_soft_placement = True
        tf_config.log_device_placement = False
        with tf.Session(config=tf_config) as sess:
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format("..\\process\\runs\\1548669564\\checkpoints\\model-4100"))
            saver.restore(sess, "..\\process\\runs\\1548669564\\checkpoints\\model-4100")
            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]
            # Generate batches for one epoch
            batches = batch_iter(list(x_test), batch_size, 1, shuffle=False)
            # Collect the predictions here
            all_predictions = []
            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {
                    input_x: x_test_batch,
                    dropout_keep_prob: 1.0
                })
                all_predictions = np.concatenate([all_predictions, batch_predictions])
    # Print accuracy if y_test is defined
    if y_test is not None:
        correct_predictions = float(sum(np.argmax(y_test, 1) == all_predictions))
        logger.info("Total number of test examples: {}".format(len(y_test)))
        logger.info("Accuracy: {:g}".format(correct_predictions / float(len(y_test))))
    # Save the test result to a csv
    predictions_human_readable = np.column_stack((np.array(x_text), all_predictions))
    output_path = "../data/test_data/prediction.csv"
    logger.info("Saving evaluation to {0}".format(output_path))
    with open(output_path, 'w') as f:
        csv.writer(f).writerows(predictions_human_readable)
def train():
    """
    model train process
    :return:
    """
    logger.info('Loading train data...')
    # load train data
    x_text, y = load_data_and_labels(TRAIN_DATA_PATH)
    # build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    vocab_size = len(vocab_processor.vocabulary_)
    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    del x, y, x_shuffled, y_shuffled
    logger.info("Vocabulary Size: {:d}".format(vocab_size))
    logger.info("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    # define pretrained embedding
    embedding_dim = EMBEDDING_DIM
    embeddings = None
    if IS_PRETRAINED_EMBEDDING:
        embeddings = np.load(PRETRAINED_EMBEDDING_PATH)
        logger.info("embedding shape {}".format(embeddings.shape))
        vocab_size = embeddings.shape[0]
        embedding_dim = embeddings.shape[1]
    # load saved middle model last time
    ckpt = None
    if IS_MIDDLE_MODEL:
        assert os.path.isdir(MIDDLE_PATH), '{} must be a directory'.format(MIDDLE_PATH)
        ckpt = tf.train.get_checkpoint_state(MIDDLE_PATH)
        assert ckpt, 'No checkpoint found'
        assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'
    model = BiRNN(embedding_dim=embedding_dim,
                  hidden_size=hidden_size,
                  num_layer=num_layer,
                  vocab_size=vocab_size,
                  attention_size=attention_size,
                  sequence_length=x_train.shape[1],
                  num_classes=y_train.shape[1],
                  grad_clip=grad_clip,
                  lr=lr,
                  l2_reg_lambda=l2_reg_lambda,
                  dropout=dropout,
                  optimizer=optimizer,
                  num_checkpoints=num_checkpoints,
                  batch_size=batch_size,
                  num_epochs=num_epochs,
                  evaluate_every=evaluate_every,
                  checkpoint_every=checkpoint_every)
    model.build_graph()
    model.train(vocab_processor, x_train, y_train, x_dev, y_dev,
                pre_embeddings=embeddings, checkpoint_file=ckpt)
def main_process(nlp_model):
    """
    main function
    :return:
    """
    try:
        mongo = dbConnector(MONGODB_SERVER, MONGODB_PORT, MONGODB_DB, MONGODB_COLLECTION)
        es = esConnector(url=ES_URL, index=ES_INDEX, doc_type=ES_DOC_TYPE)
        cursor = mongo.collection.find(no_cursor_timeout=True)
        # for record in mongo.collection.find().batch_size(1):
        for record in cursor:
            if not len(record.get('attachmentFileList', [])):
                document_model = documentExtraction(record, nlp_model)
                if not es.check_info_exist(document_model.title):
                    logger.info('begin extract doc %s...' % document_model.title)
                    document_info = document_model.extract_knowledge_from_record()
                    if len(document_info.keys()):
                        es.insert_single_info(document_info)
                    else:
                        logger.warn('extract document info failed, skip es store')
                else:
                    logger.info('doc %s exist in es, skip' % document_model.title)
            else:
                document_model = documentExtraction(record, nlp_model)
                if not es.check_info_exist(document_model.title):
                    logger.info('begin extract doc %s...' % document_model.title)
                    document_info = document_model.extract_knowledge_from_record()
                    if len(document_info.keys()):
                        es.insert_single_info(document_info)
                    else:
                        logger.warn('extract document info failed, skip es store')
                else:
                    logger.info('doc %s exist in es, skip' % document_model.title)
                for file_name in record.get('attachmentFileList', []):
                    document_model = documentExtraction(record, nlp_model, file_name=file_name)
                    if not es.check_info_exist(document_model.title):
                        logger.info('begin extract doc %s...' % document_model.title)
                        document_info = document_model.extract_knowledge_from_record()
                        if len(document_info.keys()):
                            es.insert_single_info(document_info)
                        else:
                            logger.warn('extract document info failed, skip es store')
                    else:
                        logger.info('doc %s exist in es, skip' % document_model.title)
        cursor.close()
    except Exception as e:
        logger.error('document extract failed for %s' % str(e))
def _run(self):
    """
    Main crawler routine
    :return:
    """
    self.notice_link_list = list()
    self.title_base_url = self.base_url + '/' + self.category
    for page in range(0, self.page):
        if page == 0:
            url = self.title_base_url + '/' + 'index.htm'
        else:
            url = self.title_base_url + '/' + 'index_%d.htm' % page
        logger.info('searching gov finance notice link on page %d' % (page + 1))
        response = self.get(url)
        page_soup = BeautifulSoup(response, 'html5lib')
        # debug 2018-9-5
        # the caijingshidian (财经视点) column uses a different tag class from the other columns
        if self.category == 'caijingshidian':
            notice_tag_list = page_soup.find_all('td', attrs={'class': 'xiaxu'})
        else:
            notice_tag_list = page_soup.find_all('td', attrs={'class': 'ZITI'})
        for notice_tag in notice_tag_list:
            title = notice_tag.attrs.get('title')
            time_str = self._search_time_from_title(title)
            logger.info('notice publish time is %s' % time_str)
            if title:
                pass
            else:
                logger.warning('searching notice title failed')
                continue
            notice_info_tag = notice_tag.find('a')
            link = notice_info_tag.attrs.get('href')
            if link:
                logger.info('searching notice info for %s' % title)
                self.notice_link_list.append(link)
                link_info, is_exist = self.search_link_info(link)
                if link_info and not is_exist:
                    link_info['publishTime'] = time_str
                    self.save_notice_info(link_info)
                elif is_exist:
                    link_info['publishTime'] = time_str
                    self.save_notice_info(link_info)
                    logger.info('link info is existed')
                    continue
                else:
                    logger.warn('searching link info failed')
            else:
                logger.warning('get notice link failed for %s' % title)
            # sleep for 5 seconds
            logger.info('crawler sleeping for 5s...')
            time.sleep(5)
        # sleep for 2 seconds
        logger.info('crawler sleeping for 2s...')
        time.sleep(2)