def indicator(self, i):
    if not i:
        self._indicator = None
        return

    if PYVERSION == 2:
        try:
            i = codecs.unicode_escape_encode(i.decode('utf-8'))[0]
        except Exception:
            i = codecs.unicode_escape_encode(
                i.encode('utf-8', 'ignore').decode('utf-8'))[0]

    i = i.lower()
    self.itype = resolve_itype(i)
    self._indicator = i

    if self.itype == 'url':
        u = urlparse(self._indicator)
        self._indicator = u.geturl().rstrip('/').lower()

    if self.itype == 'ipv4':
        self._indicator = ipv4_normalize(self._indicator)

    if self.mask and (self.itype in ['ipv4', 'ipv6']):
        self._indicator = '{}/{}'.format(self._indicator, int(self.mask))
        self.mask = None
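A note on the `[0]` indexing that recurs throughout these examples: `codecs.unicode_escape_encode` returns a `(bytes, length_consumed)` pair rather than bare bytes. A minimal sketch (Python 3; the indicator value is illustrative):

import codecs

# The low-level codec functions return (bytes, length_consumed) pairs,
# which is why call sites index the result with [0].
escaped, consumed = codecs.unicode_escape_encode(u'café.example.com')
print(escaped)   # b'caf\\xe9.example.com'
print(consumed)  # 16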
def get_prep_value(self, value):
    value = super(EncryptedFieldMixin, self).get_prep_value(value)
    if value is None or value == '' or self.decrypt_only:
        return value
    if isinstance(value, str):
        value = codecs.unicode_escape_encode(value)[0]
    else:
        value = codecs.unicode_escape_encode(str(value))[0]
    return self.prefix + self.crypter().encrypt(value)
def unicode_encode(self, string):
    """Unicode-escape encode a string."""
    try:
        return codecs.unicode_escape_encode(string)[0]
    except UnicodeEncodeError:
        # UnicodeEncodeError cannot be constructed from a single message
        # string (it requires five arguments), so re-raise the original
        # exception instead.
        raise
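For context on the wrapper above: `unicode_escape_decode` inverts the encoding, so the transform is lossless. A quick round trip (a sketch, not from the source project):

import codecs

raw = codecs.unicode_escape_encode(u'snowman: ☃')[0]
print(raw)                                   # b'snowman: \\u2603'
print(codecs.unicode_escape_decode(raw)[0])  # snowman: ☃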
def __repr__(self):
    i = {}
    for k in FIELDS:
        v = getattr(self, k)
        # Handle confidence 0.0
        if not v and not v == 0.0:
            continue

        if k == 'message':
            if PYVERSION == 2:
                v = codecs.unicode_escape_encode(v.decode('utf-8'))[0]
            else:
                v = v.encode('utf-8')
            v = b64encode(v).decode('utf-8')

        if k in FIELDS_TIME and isinstance(v, datetime):
            v = v.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

        if isinstance(v, basestring):
            # use != rather than `is not`: identity checks against
            # string literals are unreliable
            if k != 'message' and not k.endswith('time') \
                    and self._lowercase is False:
                v = v.lower()

        if k == 'confidence':
            v = float(v)

        i[k] = v

    sort_keys = False
    indent = None
    if logging.getLogger('').getEffectiveLevel() == logging.DEBUG:
        sort_keys = True
        indent = 4

    try:
        return json.dumps(i, indent=indent, sort_keys=sort_keys,
                          separators=(',', ': '))
    except UnicodeDecodeError:
        i['asn_desc'] = unicode(i['asn_desc'].decode('latin-1'))
        return json.dumps(i, indent=indent, sort_keys=sort_keys,
                          separators=(',', ': '))
def pwtrans(string):
    # Hand-rolled UTF-16-LE encoding: escape each character, then either
    # parse the hex digits back out of its \uXXXX escape (low byte first,
    # then high byte) or emit a plain ASCII ordinal followed by 0.
    lll = []
    for i in string:
        uni = codecs.unicode_escape_encode(i)[0]
        s = str(uni)
        if s.find(r"b'\\") == 0:
            s = s.replace(r"b'\\u", '')
            s = s.replace(r"'", '')
            lll.append(int('0x' + s[2:], 16))   # low byte
            lll.append(int('0x' + s[0:2], 16))  # high byte
        else:
            s = s.replace(r"b'", '')
            s = s.replace(r"'", '')
            lll.extend([ord(s), 0])
    return bytes(lll)
def test_codecs_builtins(self):
    s = "abc"
    encoded = codecs.utf_8_encode(s)
    self.assertEqual(s, codecs.utf_8_decode(encoded[0])[0])
    encoded = codecs.utf_7_encode(s)
    self.assertEqual(s, codecs.utf_7_decode(encoded[0])[0])
    encoded = codecs.utf_16_encode(s)
    self.assertEqual(s, codecs.utf_16_decode(encoded[0])[0])
    encoded = codecs.utf_16_le_encode(s)
    self.assertEqual(s, codecs.utf_16_le_decode(encoded[0])[0])
    encoded = codecs.utf_16_be_encode(s)
    self.assertEqual(s, codecs.utf_16_be_decode(encoded[0])[0])
    encoded = codecs.utf_32_encode(s)
    self.assertEqual(s, codecs.utf_32_decode(encoded[0])[0])
    encoded = codecs.utf_32_le_encode(s)
    self.assertEqual(s, codecs.utf_32_le_decode(encoded[0])[0])
    encoded = codecs.utf_32_be_encode(s)
    self.assertEqual(s, codecs.utf_32_be_decode(encoded[0])[0])
    encoded = codecs.raw_unicode_escape_encode(s)
    self.assertEqual(s, codecs.raw_unicode_escape_decode(encoded[0])[0])
    encoded = codecs.unicode_escape_encode(s)
    self.assertEqual(s, codecs.unicode_escape_decode(encoded[0])[0])
    encoded = codecs.latin_1_encode(s)
    self.assertEqual(s, codecs.latin_1_decode(encoded[0])[0])
    encoded = codecs.ascii_encode(s)
    self.assertEqual(s, codecs.ascii_decode(encoded[0])[0])
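All of the `codecs.*_encode` helpers this test exercises share one calling convention: encode takes a `str` and returns `(bytes, length_consumed)`, and the matching `*_decode` returns `(str, length_consumed)`. A standalone check of that contract, assuming only the standard library:

import codecs

pairs = [(codecs.utf_8_encode, codecs.utf_8_decode),
         (codecs.latin_1_encode, codecs.latin_1_decode),
         (codecs.unicode_escape_encode, codecs.unicode_escape_decode)]
for enc, dec in pairs:
    data, n = enc("abc")
    assert isinstance(data, bytes) and n == 3  # n = characters consumed
    text, m = dec(data)
    assert text == "abc"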
def __repr__(self):
    i = {}
    for k in FIELDS:
        v = getattr(self, k)
        # Handle confidence 0.0
        if not v and not v == 0.0:
            continue

        if k == 'message':
            if PYVERSION == 2:
                v = codecs.unicode_escape_encode(v.decode('utf-8'))[0]
            else:
                v = v.encode('utf-8')
            v = b64encode(v).decode('utf-8')

        if k in FIELDS_TIME and isinstance(v, datetime):
            v = v.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

        if isinstance(v, basestring):
            # use != rather than `is not` for string comparison
            if k != 'message' and not k.endswith('time'):
                v = v.lower()

        if k == 'confidence':
            v = float(v)

        i[k] = v

    sort_keys = False
    indent = None
    if logging.getLogger('').getEffectiveLevel() == logging.DEBUG:
        sort_keys = True
        indent = 4

    try:
        return json.dumps(i, indent=indent, sort_keys=sort_keys,
                          separators=(',', ': '))
    except UnicodeDecodeError:
        i['asn_desc'] = unicode(i['asn_desc'].decode('latin-1'))
        return json.dumps(i, indent=indent, sort_keys=sort_keys,
                          separators=(',', ': '))
def get_words(url):
    # Get the HTML
    page = urllib2.urlopen(url)

    # Parse out the text
    soup = BeautifulSoup(page)
    body = soup.find_all('p')  # For all paragraphs
    text = ' '.join([paragraph.get_text() for paragraph in body])

    # Force-convert unicode to ascii to avoid problems later...
    text = codecs.unicode_escape_encode(text)[0]

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Done!
    return text, tokens
def encode(self, input, final=False):
    return codecs.unicode_escape_encode(input, self.errors)[0]
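The method above is the hook that `codecs.IncrementalEncoder` subclasses implement. Because the unicode-escape codec is stateless, each chunk can be encoded independently of the ones before it. A self-contained sketch (the class name and wiring are assumptions, not from the source):

import codecs

class UnicodeEscapeIncrementalEncoder(codecs.IncrementalEncoder):
    # Hypothetical subclass: delegates each chunk to the stateless codec.
    def encode(self, input, final=False):
        return codecs.unicode_escape_encode(input, self.errors)[0]

enc = UnicodeEscapeIncrementalEncoder()
print(enc.encode(u'a☃') + enc.encode(u'b', final=True))  # b'a\\u2603b'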
def update_event(self, inp=-1):
    self.set_output_val(
        0, codecs.unicode_escape_encode(self.input(0), self.input(1)))
def encodeLocal(x):
    if x is not None:
        return codecs.unicode_escape_encode(str(x))[0].decode("utf8")
    else:
        return None
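The `.decode("utf8")` above can never fail: the unicode-escape codec emits only ASCII bytes, since every non-ASCII character becomes a `\xXX`, `\uXXXX`, or `\UXXXXXXXX` escape. A usage sketch:

print(encodeLocal(u'naïve'))  # na\xefve  (escape sequence, ASCII-safe)
print(encodeLocal(None))      # None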
def test_empty(self):
    self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
    self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
def transition2(self, item):
    # Both codec calls return a (bytes, length_consumed) tuple,
    # hence the [0] indexing at each step.
    item = codecs.unicode_escape_encode(item[0])
    item = codecs.escape_decode(item[0])
    return item[0]
def u(x):
    byte_string, length = codecs.unicode_escape_encode(x)
    unicode_string, length = codecs.unicode_escape_decode(byte_string)
    return unicode_string
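`u()` here is a Python-2/3 compatibility shim. The encode/decode round trip is lossless for any string, because the encoder escapes backslashes before the decoder interprets them. For instance:

assert u(u'中文 with a literal \\ backslash') == u'中文 with a literal \\ backslash'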
import json
import csv
import re
import urllib2
import itertools
import codecs

### extract tweets from archive ###
json_data = open('Human-Security.json')
data = json.load(json_data)

# store tweet text as CSV
outfile = 'tweets.csv'
myfile = codecs.open(outfile, "wb", 'utf-8')
w = csv.writer(myfile)
for i in range(len(data['statuses'])):
    # note: writerow receives the (bytes, length) tuple from the codec call
    w.writerow(codecs.unicode_escape_encode(data['statuses'][i]['text']))
myfile.close()

### Most Mentioned users ###
# find all @mentions in tweet text - and store as csv
outfile = 'mentions1.csv'
myfile = codecs.open(outfile, "wb", 'utf-8')
w = csv.writer(myfile)
input = codecs.open('tweets.csv', "r", 'utf-8')
text = input.read()
regex = r"(?=(@[\w]+))"
for result in re.finditer(regex, text):
    w.writerow(codecs.unicode_escape_encode("".join(result.groups())))
myfile.close()
outfile = 'tweets_nonUTF.csv'
myfile = codecs.open(outfile, "a")
w = csv.writer(myfile)
for i in range(len(data['statuses'])):
    w.writerow(codecs.unicode_escape_encode(data['statuses'][i]['text']))
myfile.close()

outfile = 'linkslist.csv'
myfile = codecs.open(outfile, "wb", 'utf-8')
w = csv.writer(myfile)
input = codecs.open('tweets_nonUTF.csv', "r", 'utf-8')
text = input.read()
links_regex = r"(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)"
for result in re.finditer(links_regex, text):
    w.writerow(codecs.unicode_escape_encode("".join(result.groups())))
myfile.close()

# Open list of Links
input = codecs.open('linkslist.csv', "r", 'utf-8')
text = input.read()
wordlist = text.split()

# frequency of tags used
outfile = 'links_frequency.csv'
myfile = codecs.open(outfile, "wb", 'utf-8')
w = csv.writer(myfile)
wordfreq = [wordlist.count(p) for p in wordlist]
dictionary = dict(zip(wordlist, wordfreq))
aux = [(dictionary[key], key) for key in dictionary]
def start_requests(self):
    try:
        # Logging in through PhantomJS is very cumbersome, so scrape with
        # Selenium driving a real, visible browser instead: fill in the
        # username and password, then click the login button.
        browser = webdriver.Chrome()
        browser.get('http://weibo.com/')
        sleep(5)
        browser.find_element_by_id('loginname').clear()
        browser.find_element_by_id('loginname').send_keys("*****@*****.**")
        browser.find_element_by_name('password').clear()
        browser.find_element_by_name('password').send_keys("465212")
        browser.find_element_by_xpath(
            '//*[@id="pl_login_form"]/div/div[3]/div[6]/a').send_keys(
                Keys.ENTER)
        sleep(10)

        # Page 1 has a different DOM structure from the other result
        # pages, so the div offsets must be adjusted per page.
        divTemp = 0
        start = 0
        end = 0
        for page in range(1, 50):
            # search keyword and page number
            browser.get('http://s.weibo.com/weibo/摩拜&page=' + str(page))
            sleep(5)
            self.fi.write(str(page) + '\n')
            if page == 1:
                divTemp = 3
                start = 1
                end = 19
            else:
                divTemp = 1
                start = 2
                end = 20
            for item in range(start, end):
                # `text` is the post body. Overlong posts carry an
                # "expand full text" link which, when present, appears as
                # an <a> inside the post's <p> tag.
                try:
                    textLink = browser.find_element_by_xpath(
                        '//*[@id="pl_weibo_direct"]/div/div[' +
                        str(divTemp) + ']/div[' + str(item) +
                        ']/div/div[1]/dl/div/div[3]/div[1]/p/a[1]')
                except:
                    textLink = 0
                # No link under the <p> tag: grab its text directly. A few
                # characters come through garbled, so strip them outright.
                if textLink == 0:
                    text = browser.find_elements_by_xpath(
                        '//*[@id="pl_weibo_direct"]/div/div[' +
                        str(divTemp) + ']/div[' + str(item) +
                        ']/div/div[1]/dl/div/div[3]/div[1]/p')
                    text = text[0].text.replace(u"#", "").replace(
                        u" ", "").replace(u'​', "").replace(u'&#nbsp;', "")
                else:
                    # A link exists, but there may be several (some are
                    # topic hashtags such as "#天气#"), so walk through
                    # them until one carries an action-data attribute,
                    # then break.
                    for j in range(2, 10):
                        try:
                            textLink = textLink.get_attribute("action-data")
                            if textLink is None:
                                textLink = browser.find_element_by_xpath(
                                    '//*[@id="pl_weibo_direct"]/div/div[' +
                                    str(divTemp) + ']/div[' + str(item) +
                                    ']/div/div[1]/dl/div/div[3]/div[1]/p/a[' +
                                    str(j) + ']')
                            else:
                                break
                        except:
                            pass
                    # Nothing found: fall back to the plain text.
                    if textLink is None:
                        text = browser.find_elements_by_xpath(
                            '//*[@id="pl_weibo_direct"]/div/div[' +
                            str(divTemp) + ']/div[' + str(item) +
                            ']/div/div[1]/dl/div/div[3]/div[1]/p')
                        text = text[0].text.replace(u"#", "").replace(
                            u" ", "").replace(u'​', "").replace(
                                u'&#nbsp;', "")
                    # Otherwise append the action-data string to the AJAX
                    # endpoint; the response contains the full post text.
                    else:
                        textLink = ("http://s.weibo.com/ajax/direct/"
                                    "morethan140?" + textLink)
                        try:
                            resultTemp = urllib2.urlopen(textLink)
                            jsonInfo = resultTemp.read()
                            jsonFile = json.loads(jsonInfo)
                            text = jsonFile['data']['html'].replace(
                                u"#", "").replace(u" ", "").replace(
                                    u'​', "").replace(u'&#nbsp;', "")
                        # Sina's own link is sometimes empty and even a
                        # manual click on "expand" does nothing; take the
                        # unexpanded content in that case.
                        except:
                            try:
                                text = browser.find_elements_by_xpath(
                                    '//*[@id="pl_weibo_direct"]/div/div[' +
                                    str(divTemp) + ']/div[' + str(item) +
                                    ']/div/div[1]/dl/div/div[3]/div[1]/p')
                                text = text[0].text.replace(
                                    u"#", "").replace(u" ", "").replace(
                                        u'​', "").replace(u'&#nbsp;', "")
                            except:
                                continue
                # Drop characters outside the BMP (mostly emoji): escape
                # everything, delete the \UXXXXXXXX escapes, then decode
                # the surviving escapes back to text.
                text = codecs.escape_decode(
                    codecs.unicode_escape_encode(text)[0])[0]
                pattern = re.compile(r'\\U.{8}', re.U)
                text = re.sub(pattern, '', str(text))
                text = codecs.unicode_escape_decode(text)[0]
                # Get the location info (may be absent).
                try:
                    location = browser.find_elements_by_xpath(
                        '//*[@id="pl_weibo_direct"]/div/div[' +
                        str(divTemp) + ']/div/div[' + str(item) +
                        ']/div/div[1]/dl/div/div[3]/div[1]/p')
                    # find_elements returns a list, so index it first
                    location = location[0].find_element_by_class_name(
                        "W_btn_c6")
                    location = location.text.replace(u"​", "").replace(
                        u"|", "")
                except:
                    location = 0
                # Get the user name.
                userName = browser.find_element_by_xpath(
                    '//*[@id="pl_weibo_direct"]/div/div[' + str(divTemp) +
                    ']/div[' + str(item) +
                    ']/div/div[1]/dl/div/div[3]/div[1]/a[1]')
                userName = userName.text
                # Get the user avatar.
                userHead = browser.find_element_by_xpath(
                    '//*[@id="pl_weibo_direct"]/div/div[' + str(divTemp) +
                    ']/div[' + str(item) +
                    ']/div/div[1]/dl/div/div[2]/a/img')
                userHead = userHead.get_attribute("src")
                # Get the send time.
                sendTime = browser.find_element_by_xpath(
                    '//*[@id="pl_weibo_direct"]/div/div[' + str(divTemp) +
                    ']/div[' + str(item) +
                    ']/div/div[1]/dl/div/div[3]/div[2]/a[1]')
                sendTime = sendTime.text
                dt = datetime.datetime.strptime(
                    time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time())),
                    "%Y-%m-%d %H:%M:%S")
                # Normalize the timestamp: it arrives as "xx seconds ago",
                # "xx minutes ago", "today hh:mm", or an explicit date.
                if u"秒" in sendTime:
                    sendTime = dt + datetime.timedelta(
                        seconds=-int(sendTime.split(u"秒")[0]))
                    sendTime = sendTime.strftime("%Y-%m-%d %H:%M:%S")
                elif u"分钟" in sendTime:
                    sendTime = dt + datetime.timedelta(
                        minutes=-int(sendTime.split(u"分钟")[0]))
                    sendTime = sendTime.strftime("%Y-%m-%d %H:%M:%S")
                elif u"今天" in sendTime:
                    sendTime = time.strftime(
                        '%Y-%m-%d', time.localtime(time.time())) + \
                        " " + sendTime.split(u"今天")[1] + ":00"
                # Explicit month/day: zero-pad and convert to a standard
                # timestamp (this collapses four copy-pasted branches).
                else:
                    month = sendTime.split(u"月")[0].zfill(2)
                    day = sendTime.split(u"月")[1].split(u"日")[0].zfill(2)
                    sendTime = time.strftime(
                        '%Y', time.localtime(time.time())) + "-" + month + \
                        "-" + day + " " + sendTime.split(u"日")[1] + ":00"
                # Which client the post was sent from.
                try:
                    sendFrom = browser.find_element_by_xpath(
                        '//*[@id="pl_weibo_direct"]/div/div[' +
                        str(divTemp) + ']/div[' + str(item) +
                        ']/div/div[1]/dl/div/div[3]/div[2]/a[2]')
                    sendFrom = sendFrom.text
                except:
                    sendFrom = u"无"
                # Share, comment and like counts (the three have
                # different structures).
                shareCount = browser.find_element_by_xpath(
                    '//*[@id="pl_weibo_direct"]/div/div[' + str(divTemp) +
                    ']/div[' + str(item) +
                    ']/div/div[2]/ul/li[2]/a/span/em')
                shareCount = shareCount.text
                if len(shareCount) == 0:
                    shareCount = "0"
                try:
                    commondCount = browser.find_element_by_xpath(
                        '//*[@id="pl_weibo_direct"]/div/div[' +
                        str(divTemp) + ']/div[' + str(item) +
                        ']/div/div[2]/ul/li[3]/a/span/em')
                    commondCount = commondCount.text
                except:
                    commondCount = "0"
                goodCount = browser.find_element_by_xpath(
                    '//*[@id="pl_weibo_direct"]/div/div[' + str(divTemp) +
                    ']/div[' + str(item) +
                    ']/div/div[2]/ul/li[4]/a/span/em')
                goodCount = goodCount.text
                if len(goodCount) == 0:
                    goodCount = "0"
                # Insert into the database.
                try:
                    if location == 0:
                        self.coll_weiboHighTemperature.insert({
                            "userName": userName,
                            "userHead": userHead,
                            "text": text,
                            "sendTime": sendTime,
                            "sendFrom": sendFrom,
                            "shareCount": shareCount,
                            "commondCount": commondCount,
                            "goodCount": goodCount
                        })
                        self.fi2.write(userName + '\n' + userHead + '\n' +
                                       text + '\n' + sendTime + '\n' +
                                       sendFrom + '\n' + shareCount + '\n' +
                                       commondCount + '\n' + goodCount +
                                       '\n')
                    else:
                        self.coll_weiboHighTemperature.insert({
                            "userName": userName,
                            "userHead": userHead,
                            "text": text,
                            "sendTime": sendTime,
                            "sendFrom": sendFrom,
                            "shareCount": shareCount,
                            "commondCount": commondCount,
                            "goodCount": goodCount,
                            "location": location
                        })
                        self.fi2.write(userName + '\n' + userHead + '\n' +
                                       text + '\n' + sendTime + '\n' +
                                       sendFrom + '\n' + shareCount + '\n' +
                                       commondCount + '\n' + goodCount +
                                       '\n' + location + '\n')
                    print str(page) + str(item)
                # Insertion failures are mostly phone-specific special
                # characters.
                except:
                    print str(page) + str(item)
            sleep(5)
    except BaseException as e:
        self.logger.error("ERROR : start " + repr(e))
        self.close(self.name, repr(e))
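The three-step dance in the middle of the spider (escape-encode, strip `\UXXXXXXXX` sequences, escape-decode) is how it drops characters outside the BMP, typically emoji that the target database rejects. A minimal standalone sketch of the same technique (Python 3; the function name and the tighter hex-digit pattern are illustrative, not from the source):

import codecs
import re

def strip_non_bmp(text):
    # Escape every non-ASCII character, remove the 10-character
    # \UXXXXXXXX escapes that astral-plane characters produce, then
    # decode the remaining escapes back to text.
    escaped = codecs.unicode_escape_encode(text)[0].decode('ascii')
    escaped = re.sub(r'\\U[0-9a-fA-F]{8}', '', escaped)
    return codecs.unicode_escape_decode(escaped.encode('ascii'))[0]

print(strip_non_bmp(u'Mobike 😀 试骑'))  # Mobike  试骑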
    # Skip if the word is not unique
    if word not in words_unique:
        continue

    # Retrieve more words from wikipedia
    try:
        more_words = get_wiki_words(word)
        print "Retrieval successful for", word
    except Exception:
        print "Retrieval failed for", word
        continue

    # Get rid of unicode issues
    more_words = [codecs.unicode_escape_encode(w)[0] for w in more_words]

    # Clean new words by means of clean_words() from extract_words.py
    more_words = clean_words(more_words)

    # Keep the words
    all_new_words = all_new_words + more_words

# Add new words to the words_dict
words_dict[name] = sorted(words_dict[name] + all_new_words)

# Report
print "### RETRIEVAL COMPLETE FOR GROUP", group_number + 1, 'OF', \
    len(words_dict), '(' + name + ')\n'