import HTMLParser  # handles HTML-encoded characters
import re
import json
from utils import LoggerUtil
from utils.SqlUtil import MongoDB
import pymongo
import urlparse
from bs4 import BeautifulSoup  # lxml parser
from cfg import subjects, xds, PATH, URL, COLL
import sys

# PaperParse.__init__ uses requests.Session(); no import for it was visible
# in this part of the file, so it is imported here explicitly.
import requests

# Python 2 idiom: reload() re-exposes sys.setdefaultencoding so the process
# default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

html_parser = HTMLParser.HTMLParser()
logger = LoggerUtil.getLogger(__name__)
logger_major = LoggerUtil.getLogger('major')


class PaperParse:
    """Parse the exam-paper pages of the Zujuan site."""

    def __init__(self, url=URL.rootUrl):
        """Create an HTTP session and issue an initial GET request.

        Args:
            url: Address requested once right after the session is created.
                Defaults to ``URL.rootUrl``.
        """
        self.session = requests.Session()
        # The response is discarded; presumably this request only primes the
        # session (e.g. cookies) - TODO confirm.
        self.session.get(url)

    def parseParperPropAll(self, url=URL.paper_url):
        """Parse all common properties of the papers.

        Args:
            url: Paper listing address. Defaults to ``URL.paper_url``. It is
                not used by the statements visible in this view.
        """
        mongo = MongoDB()
        # Create the unique indexes
        # for key, value in COLL.type.items():
        #     coll = mongo.getCollection(value)
        # NOTE(review): the rest of this method is not visible in this view
        # of the file; only the statements above could be recovered.
#!/usr/bin/python #-*-coding:utf-8-*- import requests import os import urlparse import json from utils.SqlUtil import PostgreSql from utils import LoggerUtil, Utils import re import sys reload(sys) sys.setdefaultencoding('utf-8') logger = LoggerUtil.getLogger(__name__) SELECT_SQL = 'SELECT qid,answer FROM T_QUES_ZUJUAN_EX WHERE cate=1 AND subject= %s AND qid > %s ORDER BY seq ASC LIMIT %s ' UPDATE_SQL = 'UPDATE T_QUES_ZUJUAN_EX SET choice_answer = %s WHERE qid = %s' UPDATE_STATUS_SQL = 'UPDATE T_QUES_ZUJUAN_EX SET status = %s WHERE qid = %s' ROWS = 1000 rootImagPath = '/data/meiqiming/data/zj_image_new' OPTIONS = {'A': [], 'B': [], 'C': [], 'D': [], 'E': []} def init(): for parent, dir_names, file_names in os.walk('data'): for file_name in file_names: for key, values in OPTIONS.items(): if file_name.startswith(key): values.append(
# NOTE(review): the line above is the start of a second script that has been
# collapsed onto a single physical line. Because it begins with '#', Python
# treats the whole line as a comment, so none of it is executed as written.
#
# What the collapsed text contains, in order:
#   * a shebang and a UTF-8 coding declaration;
#   * imports (requests, os, urlparse, json, PostgreSql, LoggerUtil, Utils,
#     re, sys) followed by the Python 2 reload(sys)/setdefaultencoding call;
#   * a module-level logger;
#   * three SQL strings for table T_QUES_ZUJUAN_EX: SELECT_SQL reads
#     qid/answer rows filtered by cate=1, subject and qid, ordered by seq
#     with a LIMIT; UPDATE_SQL sets choice_answer by qid; UPDATE_STATUS_SQL
#     sets status by qid;
#   * ROWS = 1000 - presumably the value passed for the LIMIT placeholder;
#     verify against the caller;
#   * rootImagPath, a filesystem path string;
#   * OPTIONS, a dict mapping 'A'..'E' to initially empty lists;
#   * init(), which walks the 'data' directory and, for every file whose name
#     starts with one of the OPTIONS keys, appends a value to that key's list.
#
# The text is cut off inside the final values.append( call, so the appended
# value and everything after it are unknown. Restore the original line breaks
# and the missing tail from the upstream file before using this code.