def indicator(self, i):
    if not i:
        self._indicator = None
        return

    if PYVERSION == 2:
        try:
            i = codecs.unicode_escape_encode(i.decode('utf-8'))[0]
        except Exception:
            i = codecs.unicode_escape_encode(
                i.encode('utf-8', 'ignore').decode('utf-8'))[0]

    i = i.lower()
    self.itype = resolve_itype(i)
    self._indicator = i

    if self.itype == 'url':
        u = urlparse(self._indicator)
        self._indicator = u.geturl().rstrip('/').lower()

    if self.itype == 'ipv4':
        self._indicator = ipv4_normalize(self._indicator)

    if self.mask and (self.itype in ['ipv4', 'ipv6']):
        self._indicator = '{}/{}'.format(self._indicator, int(self.mask))
        self.mask = None
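A note on the `[0]` indexing that recurs throughout these examples: `codecs.unicode_escape_encode` returns a `(bytes, length_consumed)` pair rather than bare bytes. A minimal sketch (Python 3; the indicator value is illustrative):

import codecs

# The low-level codec functions return (bytes, length_consumed) pairs,
# which is why call sites index the result with [0].
escaped, consumed = codecs.unicode_escape_encode(u'café.example.com')
print(escaped)   # b'caf\\xe9.example.com'
print(consumed)  # 16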
def get_prep_value(self, value):
    value = super(EncryptedFieldMixin, self).get_prep_value(value)
    if value is None or value == '' or self.decrypt_only:
        return value
    if isinstance(value, str):
        value = codecs.unicode_escape_encode(value)[0]
    else:
        value = codecs.unicode_escape_encode(str(value))[0]
    return self.prefix + self.crypter().encrypt(value)
def unicode_encode(self, string):
    """Unicode-escape encode a string."""
    try:
        return codecs.unicode_escape_encode(string)[0]
    except UnicodeEncodeError:
        # UnicodeEncodeError cannot be constructed from a single message
        # string (it requires five arguments), so re-raise the original
        # exception instead.
        raise
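For context on the wrapper above: `unicode_escape_decode` inverts the encoding, so the transform is lossless. A quick round trip (a sketch, not from the source project):

import codecs

raw = codecs.unicode_escape_encode(u'snowman: ☃')[0]
print(raw)                                   # b'snowman: \\u2603'
print(codecs.unicode_escape_decode(raw)[0])  # snowman: ☃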
def __repr__(self):
    i = {}
    for k in FIELDS:
        v = getattr(self, k)
        # Handle confidence 0.0
        if not v and not v == 0.0:
            continue

        if k == 'message':
            if PYVERSION == 2:
                v = codecs.unicode_escape_encode(v.decode('utf-8'))[0]
            else:
                v = v.encode('utf-8')
            v = b64encode(v).decode('utf-8')

        if k in FIELDS_TIME and isinstance(v, datetime):
            v = v.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

        if isinstance(v, basestring):
            # use != rather than `is not`: identity checks against
            # string literals are unreliable
            if k != 'message' and not k.endswith('time') \
                    and self._lowercase is False:
                v = v.lower()

        if k == 'confidence':
            v = float(v)

        i[k] = v

    sort_keys = False
    indent = None
    if logging.getLogger('').getEffectiveLevel() == logging.DEBUG:
        sort_keys = True
        indent = 4

    try:
        return json.dumps(i, indent=indent, sort_keys=sort_keys,
                          separators=(',', ': '))
    except UnicodeDecodeError:
        i['asn_desc'] = unicode(i['asn_desc'].decode('latin-1'))
        return json.dumps(i, indent=indent, sort_keys=sort_keys,
                          separators=(',', ': '))
def pwtrans(string):
    # Hand-rolled UTF-16-LE encoding: escape each character, then either
    # parse the hex digits back out of its \uXXXX escape (low byte first,
    # then high byte) or emit a plain ASCII ordinal followed by 0.
    lll = []
    for i in string:
        uni = codecs.unicode_escape_encode(i)[0]
        s = str(uni)
        if s.find(r"b'\\") == 0:
            s = s.replace(r"b'\\u", '')
            s = s.replace(r"'", '')
            lll.append(int('0x' + s[2:], 16))   # low byte
            lll.append(int('0x' + s[0:2], 16))  # high byte
        else:
            s = s.replace(r"b'", '')
            s = s.replace(r"'", '')
            lll.extend([ord(s), 0])
    return bytes(lll)
def test_codecs_builtins(self):
    s = "abc"
    encoded = codecs.utf_8_encode(s)
    self.assertEqual(s, codecs.utf_8_decode(encoded[0])[0])
    encoded = codecs.utf_7_encode(s)
    self.assertEqual(s, codecs.utf_7_decode(encoded[0])[0])
    encoded = codecs.utf_16_encode(s)
    self.assertEqual(s, codecs.utf_16_decode(encoded[0])[0])
    encoded = codecs.utf_16_le_encode(s)
    self.assertEqual(s, codecs.utf_16_le_decode(encoded[0])[0])
    encoded = codecs.utf_16_be_encode(s)
    self.assertEqual(s, codecs.utf_16_be_decode(encoded[0])[0])
    encoded = codecs.utf_32_encode(s)
    self.assertEqual(s, codecs.utf_32_decode(encoded[0])[0])
    encoded = codecs.utf_32_le_encode(s)
    self.assertEqual(s, codecs.utf_32_le_decode(encoded[0])[0])
    encoded = codecs.utf_32_be_encode(s)
    self.assertEqual(s, codecs.utf_32_be_decode(encoded[0])[0])
    encoded = codecs.raw_unicode_escape_encode(s)
    self.assertEqual(s, codecs.raw_unicode_escape_decode(encoded[0])[0])
    encoded = codecs.unicode_escape_encode(s)
    self.assertEqual(s, codecs.unicode_escape_decode(encoded[0])[0])
    encoded = codecs.latin_1_encode(s)
    self.assertEqual(s, codecs.latin_1_decode(encoded[0])[0])
    encoded = codecs.ascii_encode(s)
    self.assertEqual(s, codecs.ascii_decode(encoded[0])[0])
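All of the `codecs.*_encode` helpers this test exercises share one calling convention: encode takes a `str` and returns `(bytes, length_consumed)`, and the matching `*_decode` returns `(str, length_consumed)`. A standalone check of that contract, assuming only the standard library:

import codecs

pairs = [(codecs.utf_8_encode, codecs.utf_8_decode),
         (codecs.latin_1_encode, codecs.latin_1_decode),
         (codecs.unicode_escape_encode, codecs.unicode_escape_decode)]
for enc, dec in pairs:
    data, n = enc("abc")
    assert isinstance(data, bytes) and n == 3  # n = characters consumed
    text, m = dec(data)
    assert text == "abc"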
def __repr__(self):
    i = {}
    for k in FIELDS:
        v = getattr(self, k)
        # Handle confidence 0.0
        if not v and not v == 0.0:
            continue

        if k == 'message':
            if PYVERSION == 2:
                v = codecs.unicode_escape_encode(v.decode('utf-8'))[0]
            else:
                v = v.encode('utf-8')
            v = b64encode(v).decode('utf-8')

        if k in FIELDS_TIME and isinstance(v, datetime):
            v = v.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

        if isinstance(v, basestring):
            # use != rather than `is not` for string comparison
            if k != 'message' and not k.endswith('time'):
                v = v.lower()

        if k == 'confidence':
            v = float(v)

        i[k] = v

    sort_keys = False
    indent = None
    if logging.getLogger('').getEffectiveLevel() == logging.DEBUG:
        sort_keys = True
        indent = 4

    try:
        return json.dumps(i, indent=indent, sort_keys=sort_keys,
                          separators=(',', ': '))
    except UnicodeDecodeError:
        i['asn_desc'] = unicode(i['asn_desc'].decode('latin-1'))
        return json.dumps(i, indent=indent, sort_keys=sort_keys,
                          separators=(',', ': '))
def get_words(url):
    # Get the HTML
    page = urllib2.urlopen(url)

    # Parse out the text
    soup = BeautifulSoup(page)
    body = soup.find_all('p')  # For all paragraphs
    text = ' '.join([paragraph.get_text() for paragraph in body])

    # Force-convert unicode to ascii to avoid problems later...
    text = codecs.unicode_escape_encode(text)[0]

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Done!
    return text, tokens
def encode(self, input, final=False):
    return codecs.unicode_escape_encode(input, self.errors)[0]
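The method above is the hook that `codecs.IncrementalEncoder` subclasses implement. Because the unicode-escape codec is stateless, each chunk can be encoded independently of the ones before it. A self-contained sketch (the class name and wiring are assumptions, not from the source):

import codecs

class UnicodeEscapeIncrementalEncoder(codecs.IncrementalEncoder):
    # Hypothetical subclass: delegates each chunk to the stateless codec.
    def encode(self, input, final=False):
        return codecs.unicode_escape_encode(input, self.errors)[0]

enc = UnicodeEscapeIncrementalEncoder()
print(enc.encode(u'a☃') + enc.encode(u'b', final=True))  # b'a\\u2603b'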
def update_event(self, inp=-1):
    self.set_output_val(
        0, codecs.unicode_escape_encode(self.input(0), self.input(1)))
def encodeLocal(x):
    if x is not None:
        return codecs.unicode_escape_encode(str(x))[0].decode("utf8")
    else:
        return None
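The `.decode("utf8")` above can never fail: the unicode-escape codec emits only ASCII bytes, since every non-ASCII character becomes a `\xXX`, `\uXXXX`, or `\UXXXXXXXX` escape. A usage sketch:

print(encodeLocal(u'naïve'))  # na\xefve  (escape sequence, ASCII-safe)
print(encodeLocal(None))      # None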
def test_empty(self):
    self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
    self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
def transition2(self, item):
    # Both codec calls return a (bytes, length_consumed) tuple,
    # hence the [0] indexing at each step.
    item = codecs.unicode_escape_encode(item[0])
    item = codecs.escape_decode(item[0])
    return item[0]
def u(x):
    byte_string, length = codecs.unicode_escape_encode(x)
    unicode_string, length = codecs.unicode_escape_decode(byte_string)
    return unicode_string
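`u()` here is a Python-2/3 compatibility shim. The encode/decode round trip is lossless for any string, because the encoder escapes backslashes before the decoder interprets them. For instance:

assert u(u'中文 with a literal \\ backslash') == u'中文 with a literal \\ backslash'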
import json
import csv
import re
import urllib2
import itertools
import codecs

### extract tweets from archive ###
json_data = open('Human-Security.json')
data = json.load(json_data)

# store tweet text as CSV
outfile = 'tweets.csv'
myfile = codecs.open(outfile, "wb", 'utf-8')
w = csv.writer(myfile)
for i in range(len(data['statuses'])):
    # note: writerow receives the (bytes, length) tuple from the codec call
    w.writerow(codecs.unicode_escape_encode(data['statuses'][i]['text']))
myfile.close()

### Most Mentioned users ###
# find all @mentions in tweet text - and store as csv
outfile = 'mentions1.csv'
myfile = codecs.open(outfile, "wb", 'utf-8')
w = csv.writer(myfile)
input = codecs.open('tweets.csv', "r", 'utf-8')
text = input.read()
regex = r"(?=(@[\w]+))"
for result in re.finditer(regex, text):
    w.writerow(codecs.unicode_escape_encode("".join(result.groups())))
myfile.close()
outfile = 'tweets_nonUTF.csv'
myfile = codecs.open(outfile, "a")
w = csv.writer(myfile)
for i in range(len(data['statuses'])):
    w.writerow(codecs.unicode_escape_encode(data['statuses'][i]['text']))
myfile.close()

outfile = 'linkslist.csv'
myfile = codecs.open(outfile, "wb", 'utf-8')
w = csv.writer(myfile)
input = codecs.open('tweets_nonUTF.csv', "r", 'utf-8')
text = input.read()
links_regex = r"(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)"
for result in re.finditer(links_regex, text):
    w.writerow(codecs.unicode_escape_encode("".join(result.groups())))
myfile.close()

# Open list of Links
input = codecs.open('linkslist.csv', "r", 'utf-8')
text = input.read()
wordlist = text.split()

# frequency of tags used
outfile = 'links_frequency.csv'
myfile = codecs.open(outfile, "wb", 'utf-8')
w = csv.writer(myfile)
wordfreq = [wordlist.count(p) for p in wordlist]
dictionary = dict(zip(wordlist, wordfreq))
aux = [(dictionary[key], key) for key in dictionary]
def start_requests(self):
    try:
        # Logging in through PhantomJS is very cumbersome, so scrape with
        # Selenium driving a real, visible browser instead: fill in the
        # username and password, then click the login button.
        browser = webdriver.Chrome()
        browser.get('http://weibo.com/')
        sleep(5)
        browser.find_element_by_id('loginname').clear()
        browser.find_element_by_id('loginname').send_keys("*****@*****.**")
        browser.find_element_by_name('password').clear()
        browser.find_element_by_name('password').send_keys("465212")
        browser.find_element_by_xpath(
            '//*[@id="pl_login_form"]/div/div[3]/div[6]/a').send_keys(
                Keys.ENTER)
        sleep(10)

        # Page 1 has a different DOM structure from the other result
        # pages, so the div offsets must be adjusted per page.
        divTemp = 0
        start = 0
        end = 0
        for page in range(1, 50):
            # search keyword and page number
            browser.get('http://s.weibo.com/weibo/摩拜&page=' + str(page))
            sleep(5)
            self.fi.write(str(page) + '\n')
            if page == 1:
                divTemp = 3
                start = 1
                end = 19
            else:
                divTemp = 1
                start = 2
                end = 20
            for item in range(start, end):
                # `text` is the post body. Overlong posts carry an
                # "expand full text" link which, when present, appears as
                # an <a> inside the post's <p> tag.
                try:
                    textLink = browser.find_element_by_xpath(
                        '//*[@id="pl_weibo_direct"]/div/div[' +
                        str(divTemp) + ']/div[' + str(item) +
                        ']/div/div[1]/dl/div/div[3]/div[1]/p/a[1]')
                except:
                    textLink = 0
                # No link under the <p> tag: grab its text directly. A few
                # characters come through garbled, so strip them outright.
                if textLink == 0:
                    text = browser.find_elements_by_xpath(
                        '//*[@id="pl_weibo_direct"]/div/div[' +
                        str(divTemp) + ']/div[' + str(item) +
                        ']/div/div[1]/dl/div/div[3]/div[1]/p')
                    text = text[0].text.replace(u"#", "").replace(
                        u" ", "").replace(u'​', "").replace(u'&#nbsp;', "")
                else:
                    # A link exists, but there may be several (some are
                    # topic hashtags such as "#天气#"), so walk through
                    # them until one carries an action-data attribute,
                    # then break.
                    for j in range(2, 10):
                        try:
                            textLink = textLink.get_attribute("action-data")
                            if textLink is None:
                                textLink = browser.find_element_by_xpath(
                                    '//*[@id="pl_weibo_direct"]/div/div[' +
                                    str(divTemp) + ']/div[' + str(item) +
                                    ']/div/div[1]/dl/div/div[3]/div[1]/p/a[' +
                                    str(j) + ']')
                            else:
                                break
                        except:
                            pass
                    # Nothing found: fall back to the plain text.
                    if textLink is None:
                        text = browser.find_elements_by_xpath(
                            '//*[@id="pl_weibo_direct"]/div/div[' +
                            str(divTemp) + ']/div[' + str(item) +
                            ']/div/div[1]/dl/div/div[3]/div[1]/p')
                        text = text[0].text.replace(u"#", "").replace(
                            u" ", "").replace(u'​', "").replace(
                                u'&#nbsp;', "")
                    # Otherwise append the action-data string to the AJAX
                    # endpoint; the response contains the full post text.
                    else:
                        textLink = ("http://s.weibo.com/ajax/direct/"
                                    "morethan140?" + textLink)
                        try:
                            resultTemp = urllib2.urlopen(textLink)
                            jsonInfo = resultTemp.read()
                            jsonFile = json.loads(jsonInfo)
                            text = jsonFile['data']['html'].replace(
                                u"#", "").replace(u" ", "").replace(
                                    u'​', "").replace(u'&#nbsp;', "")
                        # Sina's own link is sometimes empty and even a
                        # manual click on "expand" does nothing; take the
                        # unexpanded content in that case.
                        except:
                            try:
                                text = browser.find_elements_by_xpath(
                                    '//*[@id="pl_weibo_direct"]/div/div[' +
                                    str(divTemp) + ']/div[' + str(item) +
                                    ']/div/div[1]/dl/div/div[3]/div[1]/p')
                                text = text[0].text.replace(
                                    u"#", "").replace(u" ", "").replace(
                                        u'​', "").replace(u'&#nbsp;', "")
                            except:
                                continue
                # Drop characters outside the BMP (mostly emoji): escape
                # everything, delete the \UXXXXXXXX escapes, then decode
                # the surviving escapes back to text.
                text = codecs.escape_decode(
                    codecs.unicode_escape_encode(text)[0])[0]
                pattern = re.compile(r'\\U.{8}', re.U)
                text = re.sub(pattern, '', str(text))
                text = codecs.unicode_escape_decode(text)[0]
                # Get the location info (may be absent).
                try:
                    location = browser.find_elements_by_xpath(
                        '//*[@id="pl_weibo_direct"]/div/div[' +
                        str(divTemp) + ']/div/div[' + str(item) +
                        ']/div/div[1]/dl/div/div[3]/div[1]/p')
                    # find_elements returns a list, so index it first
                    location = location[0].find_element_by_class_name(
                        "W_btn_c6")
                    location = location.text.replace(u"​", "").replace(
                        u"|", "")
                except:
                    location = 0
                # Get the user name.
                userName = browser.find_element_by_xpath(
                    '//*[@id="pl_weibo_direct"]/div/div[' + str(divTemp) +
                    ']/div[' + str(item) +
                    ']/div/div[1]/dl/div/div[3]/div[1]/a[1]')
                userName = userName.text
                # Get the user avatar.
                userHead = browser.find_element_by_xpath(
                    '//*[@id="pl_weibo_direct"]/div/div[' + str(divTemp) +
                    ']/div[' + str(item) +
                    ']/div/div[1]/dl/div/div[2]/a/img')
                userHead = userHead.get_attribute("src")
                # Get the send time.
                sendTime = browser.find_element_by_xpath(
                    '//*[@id="pl_weibo_direct"]/div/div[' + str(divTemp) +
                    ']/div[' + str(item) +
                    ']/div/div[1]/dl/div/div[3]/div[2]/a[1]')
                sendTime = sendTime.text
                dt = datetime.datetime.strptime(
                    time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time())),
                    "%Y-%m-%d %H:%M:%S")
                # Normalize the timestamp: it arrives as "xx seconds ago",
                # "xx minutes ago", "today hh:mm", or an explicit date.
                if u"秒" in sendTime:
                    sendTime = dt + datetime.timedelta(
                        seconds=-int(sendTime.split(u"秒")[0]))
                    sendTime = sendTime.strftime("%Y-%m-%d %H:%M:%S")
                elif u"分钟" in sendTime:
                    sendTime = dt + datetime.timedelta(
                        minutes=-int(sendTime.split(u"分钟")[0]))
                    sendTime = sendTime.strftime("%Y-%m-%d %H:%M:%S")
                elif u"今天" in sendTime:
                    sendTime = time.strftime(
                        '%Y-%m-%d', time.localtime(time.time())) + \
                        " " + sendTime.split(u"今天")[1] + ":00"
                # Explicit month/day: zero-pad and convert to a standard
                # timestamp (this collapses four copy-pasted branches).
                else:
                    month = sendTime.split(u"月")[0].zfill(2)
                    day = sendTime.split(u"月")[1].split(u"日")[0].zfill(2)
                    sendTime = time.strftime(
                        '%Y', time.localtime(time.time())) + "-" + month + \
                        "-" + day + " " + sendTime.split(u"日")[1] + ":00"
                # Which client the post was sent from.
                try:
                    sendFrom = browser.find_element_by_xpath(
                        '//*[@id="pl_weibo_direct"]/div/div[' +
                        str(divTemp) + ']/div[' + str(item) +
                        ']/div/div[1]/dl/div/div[3]/div[2]/a[2]')
                    sendFrom = sendFrom.text
                except:
                    sendFrom = u"无"
                # Share, comment and like counts (the three have
                # different structures).
                shareCount = browser.find_element_by_xpath(
                    '//*[@id="pl_weibo_direct"]/div/div[' + str(divTemp) +
                    ']/div[' + str(item) +
                    ']/div/div[2]/ul/li[2]/a/span/em')
                shareCount = shareCount.text
                if len(shareCount) == 0:
                    shareCount = "0"
                try:
                    commondCount = browser.find_element_by_xpath(
                        '//*[@id="pl_weibo_direct"]/div/div[' +
                        str(divTemp) + ']/div[' + str(item) +
                        ']/div/div[2]/ul/li[3]/a/span/em')
                    commondCount = commondCount.text
                except:
                    commondCount = "0"
                goodCount = browser.find_element_by_xpath(
                    '//*[@id="pl_weibo_direct"]/div/div[' + str(divTemp) +
                    ']/div[' + str(item) +
                    ']/div/div[2]/ul/li[4]/a/span/em')
                goodCount = goodCount.text
                if len(goodCount) == 0:
                    goodCount = "0"
                # Insert into the database.
                try:
                    if location == 0:
                        self.coll_weiboHighTemperature.insert({
                            "userName": userName,
                            "userHead": userHead,
                            "text": text,
                            "sendTime": sendTime,
                            "sendFrom": sendFrom,
                            "shareCount": shareCount,
                            "commondCount": commondCount,
                            "goodCount": goodCount
                        })
                        self.fi2.write(userName + '\n' + userHead + '\n' +
                                       text + '\n' + sendTime + '\n' +
                                       sendFrom + '\n' + shareCount + '\n' +
                                       commondCount + '\n' + goodCount +
                                       '\n')
                    else:
                        self.coll_weiboHighTemperature.insert({
                            "userName": userName,
                            "userHead": userHead,
                            "text": text,
                            "sendTime": sendTime,
                            "sendFrom": sendFrom,
                            "shareCount": shareCount,
                            "commondCount": commondCount,
                            "goodCount": goodCount,
                            "location": location
                        })
                        self.fi2.write(userName + '\n' + userHead + '\n' +
                                       text + '\n' + sendTime + '\n' +
                                       sendFrom + '\n' + shareCount + '\n' +
                                       commondCount + '\n' + goodCount +
                                       '\n' + location + '\n')
                    print str(page) + str(item)
                # Insertion failures are mostly phone-specific special
                # characters.
                except:
                    print str(page) + str(item)
            sleep(5)
    except BaseException as e:
        self.logger.error("ERROR : start " + repr(e))
        self.close(self.name, repr(e))
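The three-step dance in the middle of the spider (escape-encode, strip `\UXXXXXXXX` sequences, escape-decode) is how it drops characters outside the BMP, typically emoji that the target database rejects. A minimal standalone sketch of the same technique (Python 3; the function name and the tighter hex-digit pattern are illustrative, not from the source):

import codecs
import re

def strip_non_bmp(text):
    # Escape every non-ASCII character, remove the 10-character
    # \UXXXXXXXX escapes that astral-plane characters produce, then
    # decode the remaining escapes back to text.
    escaped = codecs.unicode_escape_encode(text)[0].decode('ascii')
    escaped = re.sub(r'\\U[0-9a-fA-F]{8}', '', escaped)
    return codecs.unicode_escape_decode(escaped.encode('ascii'))[0]

print(strip_non_bmp(u'Mobike 😀 试骑'))  # Mobike  试骑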
    # Skip if the word is not unique
    if word not in words_unique:
        continue

    # Retrieve more words from wikipedia
    try:
        more_words = get_wiki_words(word)
        print "Retrieval successful for", word
    except Exception:
        print "Retrieval failed for", word
        continue

    # Get rid of unicode issues
    more_words = [codecs.unicode_escape_encode(w)[0] for w in more_words]

    # Clean new words by means of clean_words() from extract_words.py
    more_words = clean_words(more_words)

    # Keep the words
    all_new_words = all_new_words + more_words

# Add new words to the words_dict
words_dict[name] = sorted(words_dict[name] + all_new_words)

# Report
print "### RETRIEVAL COMPLETE FOR GROUP", group_number + 1, 'OF', \
    len(words_dict), '(' + name + ')\n'