def init_process(row):
    uid = str(uuid.uuid3(uuid.NAMESPACE_OID, "Process/" + row[0]))
    cat_id = get_category_id("PROCESS", row[4], row[3])
    flow_id = str(uuid.uuid3(uuid.NAMESPACE_OID, "Flow/" + row[0]))
    p = {
        "@context": "http://greendelta.github.io/olca-schema/context.jsonld",
        "@type": "Process",
        "@id": uid,
        "name": row[2],
        "processType": "UNIT_PROCESS",
        "category": {"@type": "Category", "@id": cat_id},
        "processDocumentation": {"copyright": False},
        "exchanges": [
            {
                "@type": "Exchange",
                "avoidedProduct": False,
                "input": False,
                "amount": 1.0,
                "flow": {"@type": "Flow", "@id": flow_id},
                "unit": {
                    "@type": "Unit",
                    "@id": "3f90ee51-c78b-4b15-a693-e7f320c1e894"
                },
                "flowProperty": {
                    "@type": "FlowProperty",
                    "@id": "b0682037-e878-4be4-a63a-a7a81053a691"
                },
                "quantitativeReference": True
            }
        ]
    }
    return p
def setup_app(command, conf, vars):
    """Place any commands to set up the repository here."""
    # Don't reload the app if it was loaded under the testing environment
    if not pylons.test.pylonsapp:
        load_environment(conf.global_conf, conf.local_conf)

    # Create the tables if they don't already exist
    Base.metadata.create_all(bind=Session.bind)

    namespace = uuid.UUID(conf.global_conf['uuid_namespace'])

    # Default groups
    users = model.Group(name='users')
    users.uuid = uuid.uuid3(namespace, 'GROUP' + 'users').hex
    Session.add(users)
    Session.commit()

    # Add some users from a file into the db for testing.
    # Each line of the file should be of the form: name,email,dn
    admin_file = conf.global_conf['admin_file']
    f = open(path.expandvars(admin_file), 'r')
    for line in f:
        name, email, dn = line.rstrip('\n').split(',')
        user = model.User(name=name, email=email, client_dn=dn)
        user.uuid = uuid.uuid3(namespace, dn).hex
        user.global_admin = True
        user.suspended = False
        user.groups.append(users)
        Session.add(user)
    Session.commit()
    f.close()
def migrate_standardpage_intro_and_body_to_streamfield(apps, schema_editor):
    StandardPage = apps.get_model('torchbox.StandardPage')
    stream_block = StandardPage._meta.get_field('streamfield').stream_block

    # Append body to beginning of streamfield
    for page in StandardPage.objects.exclude(body__in=['', '<p></p>', '<p><br/></p>']):
        # Add body as first block so it appears in the same place on the template
        page.streamfield = StreamValue(
            stream_block,
            [
                ('paragraph', RichText(page.body), str(uuid3(UUID_NAMESPACE, page.body))),
            ] + [
                (child.block_type, child.value, child.id)
                for child in page.streamfield
            ]
        )
        page.save()

    # Append intro to beginning of streamfield
    for page in StandardPage.objects.exclude(intro__in=['', '<p></p>', '<p><br/></p>']):
        # Add intro as first block so it appears in the same place on the template
        page.streamfield = StreamValue(
            stream_block,
            [
                ('paragraph', RichText(page.intro), str(uuid3(UUID_NAMESPACE, page.intro))),
            ] + [
                (child.block_type, child.value, child.id)
                for child in page.streamfield
            ]
        )
        page.save()
def initialize(self):
    root = {"role_name": "root"}
    admin = {"username": "******", "password": "******", "status": 1,
             "email": "*****@*****.**",
             "role_code": str(uuid.uuid3(uuid.NAMESPACE_DNS, "root"))}
    roles = AlchemyWrapper("roles")
    users = AlchemyWrapper("users")
    node = AlchemyWrapper("resource")
    if len(roles.all(**root)) == 0:
        roles.insert(root)
    if len(users.all(username="******")) == 0:
        users.insert(admin)
    attribute = getattr(options, "attribute", "scarecrow")
    for api in self.api_list.get("api"):
        combinat = api.get("url") + attribute
        node_code = str(uuid.uuid3(uuid.NAMESPACE_DNS, str(combinat)))
        node_info = {"attribute": attribute, "code": node_code,
                     "resource_name": api.get("name"),
                     "resource_URI": api.get("url")}
        if len(node.all(**node_info)) == 0:
            node.insert(node_info)
def parse(self, response):
    global uuid
    # Build the uuid3 namespace from the current URL
    namespace = uuid.uuid3(uuid.NAMESPACE_URL, response.url)
    item = Baike_Qijia_Item()
    split1 = '>'
    split2 = ';'
    item['title_id'] = response.url.split("-")[1][:-1]  # take the id from the URL
    item['title_url'] = response.url
    item['title_name'] = Selector(response).xpath("//div[@class='artical-des atical-des1 fl']/h1/text()").extract()
    item['title_introduction'] = Selector(response).xpath("//div[@class='artical-des atical-des1 fl']/div[1]/i/text()").extract()
    # Join the categories into a single '>'-separated string
    category = Selector(response).xpath("//div[@class='bk-nav clearfix']/a/text()").extract()
    item['title_category'] = split1.join(category)
    item['content_name'] = Selector(response).xpath("//div[@class='atical-floor']/div/h2/a/text()").extract()
    # Generate the content primary keys and the uuid list for the title table
    content_uuid = []
    index = 1
    while index <= len(item['content_name']):
        con_id = uuid.uuid3(namespace, '%d' % index)
        content_uuid.append(con_id.hex)
        index = index + 1
    item['content_uuid'] = content_uuid
    item['content_uuid_list'] = split2.join(content_uuid)
    # content_text still needs its embedded html tags handled
    item['content_text'] = Selector(response).xpath("//div[@class='atical-floor']/div/div/div/p").extract()
    item['image_urls'] = Selector(response).xpath("//div[@class='floor-content floor-content-ml clearfix']/div/img/@src").extract()
    return item
def update_revisions(page, content):
    streamfield_json = content.get('streamfield', '')
    if streamfield_json:
        streamfield = json.loads(streamfield_json)
    else:
        streamfield = []

    # Append body to beginning of streamfield
    if content['body'] not in ['', '<p></p>', '<p><br/></p>']:
        content['old_body'] = content['body']
        streamfield.insert(0, {
            "type": "paragraph",
            "value": content['body'],
            "id": str(uuid3(UUID_NAMESPACE, content['body'])),
        })

    # Append intro to beginning of streamfield
    if content['intro'] not in ['', '<p></p>', '<p><br/></p>']:
        streamfield.insert(0, {
            "type": "paragraph",
            "value": content['intro'],
            "id": str(uuid3(UUID_NAMESPACE, content['intro'])),
        })

    # Save streamfield content under the "body" key, as the field was renamed in this migration as well
    content['body'] = json.dumps(streamfield)
    return content
def create_mock_resource_template():
    # Resources to be requested for the 'mock' resource
    resource_requests = {'compute': {}, 'network': {}}

    # mycompute-0
    msg = rmgryang.VDUEventData_RequestInfo()
    msg.image_id = str(uuid.uuid3(uuid.NAMESPACE_DNS, 'image-0'))
    msg.vm_flavor.vcpu_count = 4
    msg.vm_flavor.memory_mb = 8192
    msg.vm_flavor.storage_gb = 40
    resource_requests['compute']['mycompute-0'] = msg

    # mycompute-1
    msg = rmgryang.VDUEventData_RequestInfo()
    msg.image_id = str(uuid.uuid3(uuid.NAMESPACE_DNS, 'image-1'))
    msg.vm_flavor.vcpu_count = 2
    msg.vm_flavor.memory_mb = 8192
    msg.vm_flavor.storage_gb = 20
    resource_requests['compute']['mycompute-1'] = msg

    # mynet-0
    msg = rmgryang.VirtualLinkEventData_RequestInfo()
    resource_requests['network']['mynet-0'] = msg

    # mynet-1
    msg = rmgryang.VirtualLinkEventData_RequestInfo()
    resource_requests['network']['mynet-1'] = msg

    return resource_requests
def mock_data(fields):
    result = {}
    for f in fields:
        fname = f["name"]
        fval = f.get("default", NotImplemented)
        if fval is not NotImplemented:
            result[fname] = fval
            continue
        ftype = f.get("type", "string")
        f_id = abs(id(f))
        if ftype == "string":
            result[fname] = uuid.uuid3(uuid.NAMESPACE_OID, str(f_id)).hex[:8]
        elif ftype == "integer":
            result[fname] = f_id % 100
        elif ftype == "float":
            result[fname] = f_id % 100 / 1.0
        elif ftype == "uuid":
            result[fname] = uuid.uuid3(uuid.NAMESPACE_OID, str(f_id)).hex
        elif ftype == "date":
            result[fname] = datetime.date.today().isoformat()
        elif ftype == "datetime":
            result[fname] = datetime.datetime.today().isoformat()
        elif ftype == "boolean":
            result[fname] = [True, False][f_id % 2]
        elif ftype.endswith("list"):
            result[fname] = []
    return result
def _memoize_make_version_hash(self):
    if self.namespace and self.namespace.startswith('http'):
        UUID = uuid.uuid3(uuid.NAMESPACE_URL, self.namespace)
    elif self.namespace:
        UUID = uuid.uuid3(uuid.NAMESPACE_DNS, self.namespace)
    else:
        UUID = uuid.uuid4()
    return base64.b64encode(UUID.bytes)[:6].decode(ENCODING)
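# Usage sketch (not from the source): uuid3 is deterministic, so the same
# namespace string always produces the same 6-character version key, which
# is what makes it usable for memoization. The ENCODING constant above is
# assumed to be "utf-8" here.
import base64
import uuid

def version_hash(namespace=None, encoding="utf-8"):
    if namespace and namespace.startswith('http'):
        u = uuid.uuid3(uuid.NAMESPACE_URL, namespace)   # URL-shaped namespaces
    elif namespace:
        u = uuid.uuid3(uuid.NAMESPACE_DNS, namespace)   # plain DNS-style names
    else:
        u = uuid.uuid4()                                # no namespace: random
    return base64.b64encode(u.bytes)[:6].decode(encoding)

assert version_hash("example.com") == version_hash("example.com")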
def _set_identifiers(self):
    self.identifiers = []
    entry_uuid = None
    if self.isbn is not None:
        entry_uuid = uuid.uuid3(self.uuid_master, self.isbn)
        self.identifiers.append('urn:isbn:%s' % self.isbn)
    else:
        entry_uuid = uuid.uuid3(self.uuid_master, ''.join(self.authors) + self.title)
    self.urn = 'urn:uuid:%s' % entry_uuid
def get_data(self, index=None):
    if index is None:
        index = self.mutation_index
    valuesize = self.valuesize_sequence[index % len(self.valuesize_sequence)]
    if self.cache_data:
        if valuesize not in self.data_cache:
            # Repeat the 36-char uuid3 string until it covers valuesize, then truncate
            self.data_cache[valuesize] = (str(uuid.uuid3(self.uuid, repr(index))) * (1 + valuesize // 36))[:valuesize]
        return repr(index) + self.data_cache[valuesize]
    else:
        return (str(uuid.uuid3(self.uuid, repr(index))) * (1 + valuesize // 36))[:valuesize]
def save(self, commit=True):
    # Save the provided password in hashed format
    user = super(UserCreationForm, self).save(commit=False)
    user.set_password(self.cleaned_data["password1"])
    # Note: the namespace here is a fresh uuid4, so each uuid3 result is
    # effectively random per call
    user.userid = str(uuid.uuid3(uuid.uuid4(), str(time.time())).hex)
    user.usignature = str(uuid.uuid3(uuid.uuid4(), str(time.time())).hex)
    user.clientid = str(uuid.uuid3(uuid.uuid4(), str(time.time())).hex)
    if commit:
        user.save()
    return user
def get_uuid_code():
    """
    Generate random codes with the uuid module.
    :return: a random code
    """
    print(uuid.uuid3(uuid.NAMESPACE_DNS, 'practice_0001.py'))  # MD5-based
    print(uuid.uuid4())  # random uuid
    print(uuid.uuid5(uuid.NAMESPACE_DNS, 'practice_0001.py'))  # SHA-1-based
    for ui in range(10):
        print(uuid.uuid3(uuid.NAMESPACE_DNS, '{}'.format(ui)))
    return uuid.uuid1()
def tc_today(n=1):
    """Generate n table codes for today."""
    for i in range(0, n):
        tcode = str(uuid.uuid3(uuid.uuid1(), 'digital menu'))[:4]
        if tcode == 'dba5':
            # Regenerate once if it collides with the reserved default code
            tcode = str(uuid.uuid3(uuid.uuid1(), 'digital menu'))[:4]
        # Insert this table code to use
        tc = TableCode(code=tcode, date=date.today())
        tc.save()
def get_json_data(self, index=None):
    if index is None:
        index = self.mutation_index
    valuesize = self.valuesize_sequence[index % len(self.valuesize_sequence)]
    if self.cache_data:
        if valuesize not in self.data_cache:
            self.data_cache[valuesize] = (str(uuid3(self.uuid, repr(index))) * (1 + valuesize // 36))[:valuesize]
        return json.dumps({'index': index, 'data': self.data_cache[valuesize], 'size': valuesize})
    else:
        return json.dumps({'index': index,
                           'data': (str(uuid3(self.uuid, repr(index))) * (1 + valuesize // 36))[:valuesize],
                           'size': valuesize})
def test_uuid3(self):
    equal = self.assertEqual

    # Test some known version-3 UUIDs.
    for u, v in [
        (uuid.uuid3(uuid.NAMESPACE_DNS, "python.org"),
         "6fa459ea-ee8a-3ca4-894e-db77e160355e"),
        (uuid.uuid3(uuid.NAMESPACE_URL, "http://python.org/"),
         "9fe8e8c4-aaa8-32a9-a55c-4535a88b748d"),
        (uuid.uuid3(uuid.NAMESPACE_OID, "1.3.6.1"),
         "dd1a1cef-13d5-368a-ad82-eca71acd4cd1"),
        (uuid.uuid3(uuid.NAMESPACE_X500, "c=ca"),
         "658d3002-db6b-3040-a1d1-8ddd7d189a4d"),
    ]:
        equal(u.variant, uuid.RFC_4122)
        equal(u.version, 3)
        equal(u, uuid.UUID(v))
        equal(str(u), v)
def generateUuid(self, email_id, machine_name):
    """Return a uuid which uniquely identifies machine name and email id."""
    uuidstr = None
    if machine_name not in self.d:
        myNamespace = uuid.uuid3(uuid.NAMESPACE_URL, machine_name)
        uuidstr = str(uuid.uuid3(myNamespace, email_id))
        self.d[machine_name] = (machine_name, uuidstr, email_id)
        self.d[uuidstr] = (machine_name, uuidstr, email_id)
    else:
        (machine_name, uuidstr, email_id) = self.d[machine_name]
    return uuidstr
def generate_id(self, email, password):
    host_id = None
    if email not in self.db:
        name = uuid.uuid3(uuid.NAMESPACE_URL, email)
        host_id = str(uuid.uuid3(name, email))
        self.db[email] = (email, password, host_id)
        self.db[host_id] = (email, password, host_id)
    else:
        (email, password, host_id) = self.db[email]
    return host_id
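# Minimal demonstration (not from the source) of the two-level pattern used
# by generateUuid and generate_id above: derive a per-entity namespace with
# uuid3, then hash the identifier again inside it. The values below are
# hypothetical; the point is that the result is stable across calls and
# across processes.
import uuid

ns = uuid.uuid3(uuid.NAMESPACE_URL, 'machine-a')      # per-machine namespace
first = str(uuid.uuid3(ns, 'user@example.com'))
second = str(uuid.uuid3(ns, 'user@example.com'))
assert first == second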
def _key_to_id(self, key):
    """
    Converts Ecospold01 "number" attributes to UUIDs using the internal
    UUID namespace.

    :param key:
    :return:
    """
    if isinstance(key, int):
        key = str(key)
    u = to_uuid(key)
    if u is not None:
        return u
    if six.PY2:
        return uuid.uuid3(self._ns_uuid, key.encode('utf-8'))
    else:
        return uuid.uuid3(self._ns_uuid, key)
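# Hypothetical sketch of the to_uuid helper assumed by _key_to_id (its real
# definition is not shown in the source): return the key as a UUID string
# when it already parses as one, else None so the caller falls back to
# uuid3 over the internal namespace.
import uuid

def to_uuid(key):
    try:
        return str(uuid.UUID(key))
    except (ValueError, AttributeError, TypeError):
        return None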
def tc_days(n=30, per_day=10):
    # Generate table codes per day
    for i in range(0, n):
        valid_day = date.today() + timedelta(i)
        tc = TableCode(code='dba5', date=valid_day)
        tc.save()
        for j in range(0, per_day):  # 10 table codes per day
            tcode = str(uuid.uuid3(uuid.uuid1(), 'digital menu'))[:4]
            if tcode == 'dba5':
                # Regenerate since it collides with the default
                tcode = str(uuid.uuid3(uuid.uuid1(), 'digital menu'))[:4]
            # Insert this table code to use
            tc = TableCode(code=tcode, date=valid_day)
            tc.save()
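# A loop-based variant (a sketch, not the source's approach): a 4-hex-char
# slice of a UUID has only 65536 possible values, so collisions with the
# reserved 'dba5' code are expected, and retrying until the draw differs is
# more robust than the single retry used above.
import uuid

def new_table_code(reserved='dba5'):
    while True:
        code = str(uuid.uuid3(uuid.uuid1(), 'digital menu'))[:4]
        if code != reserved:
            return code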
def DoubanMoviePic(header, body, SendMsgFunc):
    soup = BeautifulSoup(body)
    nodelist = soup('div', attrs={"class": "mod"})
    for node in nodelist:
        pictype = None
        if node.h2:
            if node.h2.text.startswith(u'海报'):  # "posters"
                pictype = '1'
                rawpictype = '2'
            elif node.h2.text.startswith(u'剧照'):  # "stills"
                pictype = '3'
                rawpictype = '4'
        if pictype:
            piclist = node('img', attrs={"src": RE_PIC_SRC})
            for pic in piclist:
                row = {}
                picid = RE_PIC_SRC.search(pic['src']).group('picid')
                row['CONTENT_IMG_ID'] = str(uuid.uuid3(uuid.NAMESPACE_DNS, ('Douban.rawpic.%s' % picid).encode('utf8')))
                row['CONTENT_ID'] = header['ContentId']
                row['SOURCE_URL'] = header['Url']
                row['IMG_URL'] = pic['src'].replace('/albumicon/', '/raw/')
                row['IMG_NAME'] = None
                row['IMG_TYPE'] = rawpictype
                Insert('CONTENT_IMG', row, SendMsgFunc)
def _remote_fetch(env, url, out_file=None, allow_fail=False):
    """Retrieve url using wget, performing the download in a temporary directory.

    Provides a central location to handle retrieval issues and avoid
    using interrupted downloads.
    """
    if out_file is None:
        out_file = os.path.basename(url)
    if not env.safe_exists(out_file):
        orig_dir = env.safe_run_output("pwd").strip()
        temp_ext = "/%s" % uuid.uuid3(uuid.NAMESPACE_URL,
                                      str("file://%s/%s/%s/%s" % (env.host, socket.gethostname(),
                                                                  datetime.datetime.now().isoformat(),
                                                                  out_file)))
        with _make_tmp_dir(ext=temp_ext) as tmp_dir:
            with cd(tmp_dir):
                with warn_only():
                    result = env.safe_run("wget --no-check-certificate -O %s '%s'" % (out_file, url))
                if result.succeeded:
                    env.safe_run("mv %s %s" % (out_file, orig_dir))
                elif allow_fail:
                    out_file = None
                else:
                    raise IOError("Failure to retrieve remote file")
    return out_file
def test_daily_stats(self):
    """Check that the statistics of the day are available."""
    client = Client()
    client.login(username='******', password='******')
    resources = resourceInfoType_model.objects.all()
    for resource in resources:
        resource.storage_object.publication_status = INGESTED
        resource.storage_object.save()
        client.post(ADMINROOT,
                    {"action": "publish_action", ACTION_CHECKBOX_NAME: resource.id},
                    follow=True)
    # Get stats days date
    response = client.get('/{0}stats/days'.format(DJANGO_BASE))
    self.assertEquals(200, response.status_code)
    # Get stats info of the node
    response = client.get('/{0}stats/get'.format(DJANGO_BASE))
    self.assertEquals(200, response.status_code)
    self.assertContains(response, "lrcount")
    self.assertNotContains(response, "usagestats")
    # Get full stats info of the node
    response = client.get('/{0}stats/get/?statsid={1}'.format(
        DJANGO_BASE, str(uuid.uuid3(uuid.NAMESPACE_DNS, STORAGE_PATH))))
    self.assertEquals(200, response.status_code)
    self.assertContains(response, "usagestats")
def insert(self, metadata):
    instance = self.model()
    try:
        if 'code' not in metadata and hasattr(instance, 'code'):
            if self.tablename in ("roles", "users"):
                name = metadata.get("role_name") if "role_name" in metadata else metadata.get("username")
                setattr(instance, 'code', str(uuid.uuid3(uuid.NAMESPACE_DNS, str(name))))
            else:
                setattr(instance, 'code', str(uuid.uuid4()))
        for key, value in metadata.items():
            if hasattr(instance, key):
                setattr(instance, key, value)
        self.session.add(instance)
        # Flush
        self.session.flush()
        # To Dict
        result = self.to_dict(instance.__dict__)
        # Commit
        self.session.commit()
        result["errorcode"] = 1
    except:
        self.logging_error()
        # print(traceback.format_exc())
        self.session.rollback()
        result = {'errorcode': 0}
    return result
def make_ical(data, sources):
    calweek_regex = re.compile(r'^(\d+)\. KW$')
    time_regex = re.compile(r'^(\d+)\.(\d+) - (\d+)\.(\d+)$')
    room_regex = re.compile(r'^(.*) - (.*)$')

    times = {}
    for time in data[0]['order']:
        matches = time_regex.match(time)
        if not matches:
            raise CannotParseTime("String was: %s" % time)
        newtime = {'start': rd.relativedelta(hour=int(matches.group(1)), minute=int(matches.group(2))),
                   'end': rd.relativedelta(hour=int(matches.group(3)), minute=int(matches.group(4)))}
        times[time] = newtime

    calendar = vobject.iCalendar()
    cat_map = {u"V": u"Vorlesung", u"Ü": u"Übung", u"P": u"Praktikum"}
    begin_date = None
    for week in data:
        if not begin_date:
            calweek = calweek_regex.match(week['week'])
            if not calweek:
                raise CannotParseCalweek("String was: %s" % week['week'])
            calweek = int(calweek.group(1))
            begin_date = datetime.now() + rd.relativedelta(month=1, day=4, weekday=rd.MO(-1),
                                                           weeks=+(calweek - 1), hour=0, minute=0,
                                                           second=0, microsecond=0)
        else:
            begin_date = begin_date + rd.relativedelta(weeks=+1)
        for day in range(0, 5):
            day_data = week['data'][day]
            day_date = begin_date + rd.relativedelta(days=+day)
            for time in day_data:
                for entry in day_data[time]:
                    event = calendar.add('vevent')
                    event.add('dtstart').value = day_date + times[time]["start"]
                    event.add('dtend').value = day_date + times[time]["end"]
                    cat = ""
                    if entry["typ"][0] in cat_map:
                        event.add('categories').value = ["UNI:" + cat_map[entry["typ"][0]]]
                        cat = " (%s)" % cat_map[entry["typ"][0]]
                    teacher = entry["room"]
                    room_match = room_regex.match(entry["room"])
                    if room_match:
                        event.add('location').value = room_match.group(1).strip()
                        teacher = room_match.group(2)
                    event.add('summary').value = "%s%s" % (entry['name'], cat)
                    event.add('description').value = u"Kürzel: %s\nDozent: %s\nVeranstaltungstyp: %s\nQuelle: %s" % (
                        entry["short"], teacher, entry["typ"], sources[entry['source']].string)
                    uid = uuid.uuid3(uuid.NAMESPACE_DNS,
                                     '%s %s' % (str(event.location.value), str(event.dtstart.value)))
                    event.add("uid").value = str(uid)
    return calendar.serialize()
def get_links(self):
    """Get all the news links in the page."""
    soup = BeautifulSoup(self.page)
    vote = 0
    infos = []
    links = []
    for link in soup.find_all('a'):
        l = link['href']
        if l.startswith('vote'):
            vote = 1
        elif vote == 1:
            if l.startswith("item"):
                l = "%s/%s" % (self.surl, l)
            infos = [Markup.escape(link.string), Markup.escape(l.strip()),
                     date_internet(datetime.now())]
            time.sleep(1)
            vote = 2
        elif l.startswith('item') and vote == 2:
            infos.append("%s/%s" % (self.surl, l))
            infos.append(uuid3(NAMESPACE_DNS, infos[1]))
            links.append(infos)
            vote = 0
    return links
def track_call(api_action, api_label, x_tba_app_id):
    """
    For more information about GAnalytics Protocol Parameters, visit
    https://developers.google.com/analytics/devguides/collection/protocol/v1/parameters
    """
    analytics_id = Sitevar.get_by_id("google_analytics.id")
    if analytics_id is None:
        logging.warning("Missing sitevar: google_analytics.id. Can't track API usage.")
    else:
        GOOGLE_ANALYTICS_ID = analytics_id.contents['GOOGLE_ANALYTICS_ID']
        params = urllib.urlencode({
            'v': 1,
            'tid': GOOGLE_ANALYTICS_ID,
            'cid': uuid.uuid3(uuid.NAMESPACE_X500, str(x_tba_app_id)),
            't': 'event',
            'ec': 'api-v02',
            'ea': api_action,
            'el': api_label,
            'cd1': x_tba_app_id,  # custom dimension 1
            'ni': 1,
            'sc': 'end',  # forces tracking session to end
        })
        analytics_url = 'http://www.google-analytics.com/collect?%s' % params
        urlfetch.fetch(
            url=analytics_url,
            method=urlfetch.GET,
            deadline=10,
        )
def track_call(api_action, api_details, x_tba_app_id):
    analytics_id = Sitevar.get_by_id("google_analytics.id")
    if analytics_id is None:
        logging.warning("Missing sitevar: google_analytics.id. Can't track API usage.")
    else:
        GOOGLE_ANALYTICS_ID = analytics_id.contents['GOOGLE_ANALYTICS_ID']
        params = urllib.urlencode({
            'v': 1,
            'tid': GOOGLE_ANALYTICS_ID,
            'cid': uuid.uuid3(uuid.NAMESPACE_X500, str(x_tba_app_id)),
            't': 'event',
            'ec': 'api',
            'ea': api_action,
            'el': api_details,
            'cd1': x_tba_app_id,  # custom dimension 1
            'ni': 1,
            'sc': 'end',  # forces tracking session to end
        })
        # Sets up the call
        analytics_url = 'http://www.google-analytics.com/collect?%s' % params
        urlfetch.fetch(
            url=analytics_url,
            method=urlfetch.GET,
            deadline=10,
        )
def track_notification(self, notification_type_enum, num_keys):
    """
    For more information about GAnalytics Protocol Parameters, visit
    https://developers.google.com/analytics/devguides/collection/protocol/v1/parameters
    """
    analytics_id = Sitevar.get_by_id("google_analytics.id")
    if analytics_id is None:
        logging.warning("Missing sitevar: google_analytics.id. Can't track API usage.")
    else:
        GOOGLE_ANALYTICS_ID = analytics_id.contents['GOOGLE_ANALYTICS_ID']
        params = urllib.urlencode({
            'v': 1,
            'tid': GOOGLE_ANALYTICS_ID,
            'cid': uuid.uuid3(uuid.NAMESPACE_X500, str('tba-notification-tracking')),
            't': 'event',
            'ec': 'notification',
            'ea': NotificationType.type_names[notification_type_enum],
            'ev': num_keys,
            'ni': 1,
            'sc': 'end',  # forces tracking session to end
        })
        analytics_url = 'http://www.google-analytics.com/collect?%s' % params
        urlfetch.fetch(
            url=analytics_url,
            method=urlfetch.GET,
            deadline=10,
        )
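# All three trackers above derive the GA client id the same way; a minimal
# sketch of that step (the app id value below is hypothetical). uuid3 over
# the X500 namespace anonymizes the id while keeping it stable per client,
# so sessions aggregate correctly in Analytics.
import uuid

cid = str(uuid.uuid3(uuid.NAMESPACE_X500, 'some-app-id'))
assert cid == str(uuid.uuid3(uuid.NAMESPACE_X500, 'some-app-id'))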
def createUser(self):
    """Create a new user."""
    uid = str(uuid.uuid3(uuid.NAMESPACE_DNS, self.username + str(time.time())))
    print(uid)
    self.cur.execute("INSERT INTO users VALUES(?,?,?)",
                     (self.username, self.password, uid))
    self.con.commit()
    return True
class MySpider(spider.Spider):
    def __init__(self, proxy_enable=False, proxy_max_num=setting.PROXY_MAX_NUM,
                 timeout=setting.HTTP_TIMEOUT, cmd_args=None):
        spider.Spider.__init__(self, proxy_enable, proxy_max_num,
                               timeout=timeout, cmd_args=cmd_args)
        # Site name: Hainan Public Resources Trading Center
        self.siteName = "海南省公共资源交易中心"
        # Category code: 01 news, 02 forum, 03 blog, 04 weibo, 05 print media,
        # 06 WeChat, 07 video, 99 search engine
        self.info_flag = "99"
        # Entry URL list
        # self.start_urls = ["http://www.bidcenter.com.cn/viplist-1.html"]
        self.start_urls = ["http://zw.hainan.gov.cn"]
        self.encoding = 'utf-8'
        self.site_domain = 'hainan.gov.cn'
        self.dedup_uri = None
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Connection": "keep-alive",
            # "Host": "www.fjggzyjy.cn",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
            # "Content-Type": "application/x-www-form-urlencoded",
            # "Referer": "https://www.bidcenter.com.cn",
        }
        # self.proxy_url = 'http://spider-ip-sync.istarshine.net.cn/proxy_100ms.txt'
        self.request_headers = {'headers': self.headers}
        self.conn_config = redis.StrictRedis.from_url('redis://192.168.1.34/1')
        redis_ip = self.conn_config.get("redis_ip")
        redis_db = self.conn_config.get("redis_db")
        mysql_ip = self.conn_config.get("mysql_ip")
        mysql_databases = self.conn_config.get("mysql_databases")
        mysql_username = self.conn_config.get("mysql_username")
        mysql_password = self.conn_config.get("mysql_password")
        mysql_list_info = self.conn_config.get("mysql_list_info")
        try:
            self.conn = redis.StrictRedis.from_url('redis://{0}/{1}'.format(redis_ip, redis_db))
        except:
            self.conn = None
        # self.db = DB().create('mysql://*****:*****@192.168.20.247:3306/hbdx')
        self.db = DB().create('mysql://{0}:{1}@{2}:3306/{3}'.format(
            mysql_username, mysql_password, mysql_ip, mysql_databases))
        self.table = mysql_list_info

    def get_start_urls(self, data=None):
        '''Return start_urls.'''
        return self.start_urls

    def parse(self, response, url):
        url_list = [
            'http://zw.hainan.gov.cn/ggzy/ggzy/jgzbgg/index.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/jgzbgg/index_2.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/jgzbgg/index_3.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/cggg/index.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/cggg/index_2.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/cggg/index_3.jhtml',  # construction tender announcements
            'http://zw.hainan.gov.cn/ggzy/ggzy/jgzbgs/index.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/jgzbgs/index_2.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/jgzbgs/index_3.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/cgzbgg/index.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/cgzbgg/index_2.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/cgzbgg/index_3.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/cgzbgg/index_4.jhtml',  # construction award announcements
        ]
        return (url_list, None, None)

    def parse_detail_page(self, response=None, url=None):
        try:
            response.encoding = self.encoding
            unicode_html_body = response.text
            data = htmlparser.Parser(unicode_html_body)
        except Exception as e:
            return []
        from_tag_url = response.url
        print from_tag_url
        zhaobgg = [
            'http://zw.hainan.gov.cn/ggzy/ggzy/jgzbgg/index.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/jgzbgg/index_2.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/jgzbgg/index_3.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/cggg/index.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/cggg/index_2.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/cggg/index_3.jhtml',
        ]
        zhongbgg = [
            'http://zw.hainan.gov.cn/ggzy/ggzy/jgzbgs/index.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/jgzbgs/index_2.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/jgzbgs/index_3.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/cgzbgg/index.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/cgzbgg/index_2.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/cgzbgg/index_3.jhtml',
            'http://zw.hainan.gov.cn/ggzy/ggzy/cgzbgg/index_4.jhtml',  # construction award announcements
        ]
        if from_tag_url in zhaobgg:
            tag = "招标公告"  # tender announcement
        elif from_tag_url in zhongbgg:
            tag = "中标公告"  # award announcement
        else:
            tag = "招标公告"
        li_content = data.xpathall('''//table[@class="newtable"]//tr''')
        for item in li_content:
            title = item.xpath('''//a/@title''').text().strip()
            link = item.xpath('''//a[@target="_blank"]/@href''').text().strip()
            date = item.xpath('''//td[4]/text()''').text().strip()
            date = str(date).replace("-", "")
            if self.getdumps(link):
                continue
            link = str(link)
            uid = str(uuid.uuid5(uuid.NAMESPACE_DNS, link)) + str(uuid.uuid3(uuid.NAMESPACE_DNS, link))
            ctime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            location = "海南省"  # Hainan Province
            service = ''
            industry = ""
            post = {
                "uuid": uid,            # md5
                "detailUrl": link,      # url
                "name": title,          # title
                "location": location,   # region
                "publicTime": date,     # publication time
                "tag": tag,             # tag
                "site": self.site_domain,
                "siteName": self.siteName,
                "ctime": ctime,
                "industry": industry,
                "service": service
            }
            dic = self.handle_post(post)
            try:
                self.db.table(self.table).add(dic)
            except Exception as e:
                print e
        return
def qiniufetch(url, file_name):
    headers = {"user_agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"}
    if 'http' in url:
        # A proxy pool (Proxy_contact) was used here previously; a plain
        # request with a timeout replaces it.
        try:
            res = requests.get(url, headers=headers, timeout=10)
            con = res.content
        except:
            return False
        with open('article.jpg', 'wb') as f:
            f.write(con)
    else:
        try:
            img_url = 'http:' + url
            res = requests.get(img_url, headers=headers, timeout=10)
            con = res.content
            with open('article.jpg', 'wb') as f:
                f.write(con)
        except:
            log.info('Image format is non-standard')
            return False
    filename = uuid.uuid3(uuid.NAMESPACE_DNS, file_name)
    # Fill in your Access Key and Secret Key
    access_key = 'qjku2wyeTzY-yXiQ3JuTvkT87kn4OBdrA3VnK46e'
    secret_key = 'JHbwSYk-0e2GqzH8--H-AO9X12BiNYq-qbAdzLY7'
    # Build the auth object
    q = Auth(access_key, secret_key)
    # Bucket to upload into
    bucket_name = bucket
    # File name to store under after uploading to Qiniu
    key = str(filename)
    # Generate the upload token; an expiry time can be given
    token = q.upload_token(bucket_name, key, 3600)
    # Local path of the file to upload
    localfile = './article.jpg'
    ret, info = put_file(token, key, localfile)
    # print(info)
    # Verify the result once; retrying the same checks in a loop could
    # never succeed after a failed upload.
    assert ret['key'] == key
    assert ret['hash'] == etag(localfile)
    log.info('Uploaded image {} successfully'.format(filename))
    bucket_domain = 'http://image.fangjia.com'
    file_url = bucket_domain + "/" + str(filename)
    return file_url
def uuid_hash(address1, city, address2, state, country):
    return uuid.uuid3(uuid.NAMESPACE_DNS, address1 + city + address2 + state + country)
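# Usage sketch (not from the source): because uuid3 hashes its input, the
# same address fields always map to the same UUID, so the result can serve
# as a stable deduplication key. The address values below are hypothetical.
import uuid

a = uuid_hash('1 Main St', 'Springfield', '', 'IL', 'US')
b = uuid_hash('1 Main St', 'Springfield', '', 'IL', 'US')
assert a == b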