Beispiel #1
0
def writeCityName():
    if not os.path.exists('cityName.csv'):
        url = "http://www.zxinc.org/gb2260.htm"
        print 'start reading ...'
        response = urllib.urlopen(url)
        page = response.read()
        page = page.decode('utf8')
        print 'reading done...'
        pattern = re.compile(ur'([\u4e00-\u9fa5]{2,5}市)')
        match = pattern.findall(page)
        if match:
            try:
                with open('cityName.csv', 'wb') as csvfile:
                    csvWrite = csv.writer(csvfile,
                                          delimiter=' ',
                                          quotechar='|',
                                          quoting=csv.QUOTE_MINIMAL)
                    csvfile.write(codecs.BOM_UTF8)
                    test = PinYin()
                    test.load_word()
                    for result in match:
                        result = result.encode('utf8')
                        py = test.hanzi2pinyin(string=result[:-3])
                        csvWrite.writerow([result[:-3], py[-1]])
                print 'write done!'
            except Exception as e:
                print e
            finally:
                csvfile.close()
    else:
        print 'cityName.csv detected'
Beispiel #2
0
 def __init__(self):
     """Load the pinyin converter and the JSON dictionary from ``pinyin_dict``."""
     self.pp = PinYin()
     self.pp.load_word()
     # Only the first line of the file is parsed as JSON. The ``with``
     # block closes the file, so no explicit close() is needed.
     with open('pinyin_dict', 'r') as ff:
         self.jj_dict = json.loads(ff.readline())
Beispiel #3
0
def get_pinyin_data():
    """Return the shared ``PinYin`` instance, creating it on first use.

    The instance is stored in the module-level ``gPinYin`` and is built from
    ``pinyin_word.data`` the first time this function is called.
    """
    global gPinYin
    if gPinYin is not None:
        return gPinYin
    gPinYin = PinYin('pinyin_word.data')
    gPinYin.load_word()
    return gPinYin
def get_item(marc_no, status=0):
    dict = {}
    test = PinYin()
    test.load_word('word.data')
    hm = requests.get(ourl + 'item.php?marc_no=' + str(marc_no)).text.encode(
        encoder).decode('utf8').replace(' ', '')
    parser = HTMLParser.HTMLParser()
    s1 = parser.unescape(hm)
    static = re.findall('<div id="book_info">(.*?)<div class="clear"></div>',
                        s1, re.S)[0]
    booklist = re.findall('<dl class="booklist">(.*?)</dl>', static, re.S)
    for each in booklist:
        pm = re.findall('<dt>(.*?)</dt>', each, re.S)[0]
        if pm == '':
            continue
        st = re.findall('<dd>(.*?)</dd>', each, re.S)[0]
        try:
            st1 = re.findall('>(.*?)</a>', st, re.S)[0]
        except:
            st1 = st
        pms = test.hanzi2pinyin_split(string=pm, split="",
                                      firstcode=True).replace('/', '')
        dict[pms] = st1
        if status == 1:
            print pm,
            print st1
    return dict
Beispiel #5
0
 def __init__(self):
     """Create the parser and pinyin converter and load ``pinyin_dict``.

     ``pinyin_dict`` is read from the directory containing this module.
     """
     self.pa=Parser()
     self.pp=PinYin()
     self.pp.load_word()
     dict_path=os.path.join(os.path.dirname(__file__),'pinyin_dict')
     # Only the first line of the file is parsed as JSON. The ``with``
     # block closes the file, so no explicit close() is needed.
     with open(dict_path,'r') as ff:
         self.jj_dict=json.loads(ff.readline())
Beispiel #6
0
    def get_authors_by_venue(cached_list, cached_set, cdblp_venue, dblp_venue):
        """Count C-DBLP authors of a venue that also appear in a DBLP venue.

        C-DBLP publications come from the ``cdblp-pub-cache.data`` cache and
        DBLP publications from the DBLP search API. Each C-DBLP author name
        is converted with ``CDBLPAuthor.get_english_name`` and matched
        against the DBLP author set, first by ``full_name`` and then, for
        three-character names, by ``full_name_dash``.

        Args:
            cached_list: unused in this function.
            cached_set: unused in this function.
            cdblp_venue: mapping whose ``'title'`` is the cache key.
            dblp_venue: mapping whose ``'title'`` is put into the DBLP query.

        Returns:
            dict mapping the matched English name to
            ``{'zh': <Chinese name>, 'count': <number of matches>}``, or
            ``None`` when the C-DBLP venue is not in the cache.
        """
        d = DBLPQuery.get_cache('cdblp-pub-cache.data')

        cdblp_title = cdblp_venue.get('title')
        if cdblp_title not in d:
            print('This C-DBLP venue is not on file.')
            return

        res = urlopen('http://www.dblp.org/search/api/?q=ce:venue:{}:*&h=750&format=json'.format(dblp_venue.get('title').lower()))
        # fix titles as { "Title ..." }
        fixed_json = re.compile(r'({\s*)(".+")(\s*})').sub(lambda m: m.group(2), res.read().decode('utf-8'))

        # get publications
        cdblp_pubs = d.get(cdblp_title)
        dblp_pubs = json.loads(fixed_json)

        cdblp_authors = set()
        dblp_authors = set()
        authors = dict()

        # The cache is nested three levels deep; the innermost values are
        # lists of publications.
        for by_inner_key in cdblp_pubs.values():
            for pubs in by_inner_key.values():
                for pub in pubs:
                    cdblp_authors.update(pub.get('authors'))

        for pub in dblp_pubs.get('result').get('hits').get('hit'):
            try:
                dblp_authors.update(pub.get('info').get('authors').get('author'))
            except AttributeError:
                # A hit without 'info' or 'authors' yields None here.
                print('PublicationException: %s' % pub.get('@id'))

        pinyin = PinYin()
        pinyin.load_word()

        for author in cdblp_authors:
            name_comp = CDBLPAuthor.get_english_name(author, pinyin)
            if name_comp['full_name'] in dblp_authors:
                key = name_comp['full_name']
            # Bug fix: this used to test ``authors`` (the result dict), where
            # dashed names are never stored, so the branch could never run.
            elif len(author) == 3 and name_comp['full_name_dash'] in dblp_authors:
                key = name_comp['full_name_dash']
            else:
                continue
            entry = authors.setdefault(key, {'zh': name_comp['zh'], 'count': 0})
            entry['zh'] = name_comp['zh']
            entry['count'] += 1

        return authors
Beispiel #7
0
def t2():
    """Ad-hoc debugging routine for mixed Chinese/Latin name conversion.

    Works on a hard-coded sample name: inserts a space after every character
    for which ``is_hz_py(current, next)`` is false, extracts the runs of CJK
    ideographs, expands their pinyin readings with ``Cartesian_product`` and
    prints the name with each run replaced by its pinyin. Nothing is
    returned; all output goes to stdout (Python 2 print statements).
    """
    test = PinYin()
    test.load_word()
    #string = u"Kottlers古玩城"
    #string = u"Head 2 Toe发型店"
    #string = u"蓝"
    #print string
    #print test.hanzi2pinyin(string=string)
    #print Cartesian_product(test.hanzi2pinyin(string=string))

    # Only the last uncommented assignment takes effect; the earlier ones are
    # alternative samples kept for manual experiments.
    name = u"普季(商城)"
    name = u"Kottlers古玩城"
    name = u"hello 艾压(重庆店)山"
    name = u"库兰达(库兰达热带雨林)"
    #name = u"盛文甘hello店(店)"
    #name = u"义乌三期市场(原篁园市场)"
    print name
    p = re.compile(u'[\u4e00-\u9fa5]+')
    # NOTE(review): p_eng is never used below.
    p_eng = re.compile(u'[a-zA-Z]+')
    j = 0
    strs = []
    # Walk the name character by character, appending a space after every
    # character whose pair with the next one fails is_hz_py().
    while (j < len(name)):

        #for j in xrange(len(name)):
        #    if j

        if j + 1 == len(name):
            # Last character: there is no following character to compare.
            strs.append(name[j])
        else:
            print(name[j], name[j + 1]), is_hz_py(name[j], name[j + 1])
            if not is_hz_py(name[j], name[j + 1]):
                print name[j], j
                strs.append(name[j] + u" ")
            else:
                strs.append(name[j])
        j += 1
    name = "".join(strs)
    ch_names = p.findall(name)
    # NOTE(review): tmp, ll and mydict are never used below.
    tmp = name
    ll = []
    mydict = {}
    cnames = "".join([ch_name for ch_name in ch_names])
    #pys = test.hanzi2pinyin(string=cnames)
    pys = Cartesian_product(test.hanzi2pinyin(string=cnames))
    print cnames, pys, ch_names
    # NOTE: the loop variable rebinds ``p``; the compiled pattern above is no
    # longer needed at this point.
    for p in pys:
        tmp2 = name
        for ch_name in ch_names:
            # Locate the run inside the concatenated CJK text to find which
            # slice of the pinyin sequence belongs to it.
            m = re.search(ch_name, cnames)
            _start = m.start()
            _end = m.end()
            replace = " ".join([k for k in p.split()[_start:_end]])
            print _start, _end, replace, tmp2
            # Replace only the first occurrence of the run.
            tmp2 = re.sub(ch_name, replace, tmp2, 1)
        print tmp2
Beispiel #8
0
 def getciyun(self):
     """Render the three word clouds for the current topic.

     The output name of each cloud is the topic converted with
     ``hanzi2pinyin_split(split="-")`` followed by a fixed suffix.
     """
     converter = PinYin()
     converter.load_word()
     prefix = str(converter.hanzi2pinyin_split(string=str(self.aa.topic), split="-"))
     # (input text file, output-name suffix), rendered in this order:
     # answerer information, question information, location information.
     jobs = (
         ('F:/zhihu/answer/people_qb.txt', 'people'),
         ('F:/zhihu/answer/question_top10.txt', 'question'),
         ('F:/zhihu/answer/p_location.txt', 'slocation'),
     )
     for source_path, suffix in jobs:
         cloud.ciyun1(source_path, prefix + suffix)
Beispiel #9
0
def name_tran(str):
    """Convert a Chinese name to "Given Family" order in title-cased pinyin.

    The first character is taken as the family name and the remaining
    characters as the given name. The given-name characters are printed as a
    side effect.
    """
    converter = PinYin()
    converter.load_word()
    family = converter.hanzi2pinyin(string=str[0])[0]
    given_chars = str[1:]
    print(given_chars)
    given = u''.join(converter.hanzi2pinyin(string=given_chars))
    return given.title() + u' ' + family.title()
Beispiel #10
0
 def __init__(self):
     """Load the pinyin converter and the syllable tables used by this class.

     ``shengMu`` and ``zhengTi`` are fixed lists of pinyin fragments
     (presumably initials and whole-syllable readings -- confirm against the
     methods that consume them).
     """
     self.pinYinRobot = PinYin()
     self.pinYinRobot.load_word()
     self.shengMu = "b p m f d t n l g k h j q x zh ch sh r z c s y w".split()
     self.zhengTi = "zhi chi shi ri zi ci si yu ye yue yuan yin yun ying".split()
     print("pinYinRobot is loaded")
Beispiel #11
0
    def _generate_name(self):
        """Build the de-duplicated list of name-derived candidate strings.

        Returns an empty list when neither ``self.name`` nor ``self.email``
        is set. Otherwise combines the formatted pinyin of the name, the
        formatted username and the e-mail derived candidates.
        """
        if not (self.name or self.email):
            return []

        # true name
        converter = PinYin(PINYIN)
        converter.load_word()
        name_pinyin_list = map(converter.hanzi2pinyin, self.name)

        candidates = []
        candidates.extend(self._format(name_pinyin_list, built_in.name_formats))
        candidates.extend(self._format(self.username, built_in.general_formats))
        candidates.extend(self._generate_email())
        return list(set(candidates))
Beispiel #12
0
def draw_frame(faces, img, gray, move):
    """Annotate detected faces on ``img``, overlay the FPS and show the frame.

    For every face rectangle the matching region of ``gray`` is resized to
    200x200 and passed to ``model.predict``. When the second element of the
    prediction is below 500.0, the name looked up in ``names`` is passed to
    ``change_identity`` and drawn above the rectangle as pinyin together
    with that value.

    Args:
        faces: iterable of ``(x, y, w, h)`` rectangles.
        img: image that is drawn on and displayed.
        gray: image the face regions are cut from
            (presumably the greyscale version of ``img`` -- confirm).
        move: when equal to 2, ``steering_control(faces, img)`` is called first.
    """
    global fps

    if move == 2:
        steering_control(faces, img)

    pyin = None  # loaded lazily, and at most once per frame
    # Draw a rectangle around every face
    for (x, y, w, h) in faces:

        cv2.rectangle(img, (x, y), (x + w, y + h), (200, 255, 0), 2)
        # Bug fix: image arrays are indexed [row, column], i.e. [y, x]; the
        # previous gray[x:x + w, y:y + h] cut a transposed region.
        roi = gray[y:y + h, x:x + w]
        try:
            roi = cv2.resize(roi, (200, 200), interpolation=cv2.INTER_LINEAR)
            params = model.predict(roi)
            if params[1] < 500.0:
                if pyin is None:
                    pyin = PinYin()
                    pyin.load_word()
                pname = names[params[0]]
                change_identity(pname)
                pname = pyin.hanzi2pinyin_split(string=pname, split='')
                sign = "%s %.2f" % (''.join(pname), params[1])
                cv2.putText(img, sign, (x, y - 2), cv2.FONT_HERSHEY_SIMPLEX,
                            0.5, (0, 0, 255), 2)
        except Exception:
            # Best effort: a face that cannot be processed is skipped.
            # ``Exception`` instead of a bare except keeps Ctrl-C working.
            continue

    # Calculate and show the FPS
    fps = fps + 1
    sfps = fps / (time.time() - t_start)
    cv2.putText(img, "FPS : " + str(int(sfps)), (10, 15),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

    cv2.imshow("recognize-face", img)
Beispiel #13
0
    def get_sample_users():
        """Scrape the sample author list and cache it in ``author-cache.data``.

        Every link whose href starts with ``homepage/`` and that contains a
        ``<strong>`` element contributes one entry, produced by
        ``CDBLPAuthor.getEnglishName`` from the first child of that element.

        Returns:
            The list of name dicts that was written to the cache as JSON.
        """
        res = urlopen('http://easyscholar.ruc.edu.cn/moreuser.html')
        dom = BeautifulSoup(res)

        author_list = []
        for author_tag in dom.find_all(href=re.compile('^homepage/')):
            strong_tag = author_tag.findChild('strong')
            if strong_tag:
                author_list.append(
                    CDBLPAuthor.getEnglishName(strong_tag.contents[0]))

        # The cache is opened only after scraping has succeeded, so a failed
        # download no longer leaves a truncated cache file or an open handle.
        with open('author-cache.data', 'w') as cache:
            cache.write(json.dumps(author_list))
        return author_list
Beispiel #14
0
def idiomFind(x):
    """Return a random line of ``idiom.txt`` whose first character reads ``x``.

    The lines are shuffled and the first one whose leading character has
    ``x`` as its first pinyin result is returned unchanged. The first three
    bytes of a line are decoded as UTF-8, i.e. one CJK character.

    Args:
        x: pinyin to match against.

    Returns:
        The matching line, or ``None`` when no line matches.

    Raises:
        ValueError: if ``x`` is ``None``.
    """
    if x is None:
        raise ValueError('x must not be None')

    # Load the dictionary once instead of once per line of the file.
    test = PinYin()
    test.load_word()

    with open('idiom.txt', 'r') as f:
        base = f.readlines()
    random.shuffle(base)

    for i in base:
        if len(i) <= 1:
            continue
        try:
            c = i[:3].decode('utf8')
            py = test.hanzi2pinyin(c)[0]
        except Exception:
            # Lines that cannot be decoded or converted are skipped.
            continue
        if py == x:
            return i
    return None
Beispiel #15
0
def main(args):
    """Fill in ``roominfo.py_name`` for every row where it is still NULL.

    The stored value is the first pinyin element of the room name in full,
    followed by the first letter of every further element. Each row is
    committed individually.

    Args:
        args: unused.
    """
    test = PinYin()
    test.load_word()

    conn = getconn()
    try:
        cursor = conn.cursor()
        cursor.execute('select rname,rid from roominfo where py_name is null')
        rows = cursor.fetchall()

        for row in rows:
            myword = row[0].encode("utf8")
            pylist = test.hanzi2pinyin(string=myword)
            if not pylist:
                # Nothing to store; ``pylist[0]`` used to raise IndexError
                # here and abort the remaining rows.
                continue
            pystr = pylist[0] + ''.join(w[0] for w in pylist[1:])
            cursor.execute('update roominfo set py_name=? where rid=?',
                           (pystr, row[1]))
            conn.commit()
    finally:
        # Release the connection even when a query or conversion fails.
        conn.close()
Beispiel #16
0
    def Convert(self):
        """Copy the vCard file to ``ok_<filename>``, adding phonetic names.

        Every line containing an ``N:a;b;c;d;`` field is replaced by a
        normalised ``N:`` line followed by ``X-PHONETIC-LAST-NAME`` and
        ``X-PHONETIC-FIRST-NAME`` lines holding the capitalised pinyin. All
        other lines are copied unchanged.

        When the family-name field is longer than 3 bytes, only its first 3
        bytes are kept as the family name and the rest is moved to the front
        of the given name (this assumes UTF-8 text with a single-character
        family name -- confirm against the input data).
        """
        py_engine = PinYin()
        py_engine.load_word()

        def phonetic_line(prefix, hanzi):
            # One output line: prefix + capitalised non-empty pinyin pieces.
            phones = py_engine.hanzi2pinyin(string=hanzi)
            return prefix + ''.join(
                item.capitalize() for item in phones if item != '') + "\n"

        # Same pattern as before without the redundant escapes; ``\N`` is
        # rejected by the ``re`` module on Python 3.8+.
        name_pattern = re.compile(r"(N:[^;]*;[^;]*;[^;]*;[^;]*;)")

        contact = []
        # The file used to be opened twice (once for iteration, once for
        # readline) and neither handle was closed.
        with open(self.filename, 'r') as f:
            for line in f:
                k = name_pattern.findall(line)
                if not k:
                    contact.append(line)
                    continue

                field = k[0]
                first = field.find(';')
                second = field.find(';', first + 1)
                if first - 2 > 3:
                    xing = field[2:5]
                    ming = field[5:first] + field[first + 1:second]
                else:
                    xing = field[2:first]
                    ming = field[first + 1:second]
                contact.append('N:' + xing + ';' + ming + ';' + ";;\n")
                contact.append(phonetic_line("X-PHONETIC-LAST-NAME:", xing))
                contact.append(phonetic_line("X-PHONETIC-FIRST-NAME:", ming))

        # ``with`` guarantees the output is flushed and closed.
        with open("ok_" + self.filename, 'w') as fout:
            fout.writelines(contact)
Beispiel #17
0
                host='192.100.2.31',
                user='******',
                passwd='opensesame',
                db='traincrawler',
                port=3306)
# Python 2 only: force the interpreter-wide default string encoding to UTF-8.
# ``reload(sys)`` is required because ``sys.setdefaultencoding`` is removed
# from the module during interpreter start-up.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Query URL template; the two %s slots are the departure and arrival city.
# The departure date is hard-coded in the URL.
base_url = "http://trains.ctrip.com/TrainBooking/Ajax/GetTrainDataV2.aspx?DepartureCity=%s&ArrivalCity=%s&DepartureDate=2017-03-30&NO=01"
# post_param = 'http://trains.ctrip.com/TrainBooking/Ajax/SearchListHandler.ashx?Action=getSearchList&value={"IsBus": False, "Filter": "0", "Catalog": "", "IsGaoTie":False, "IsDongChe":False, "CatalogName": "", "DepartureCity": %s, "ArrivalCity": %s, "HubCity": "", "DepartureCityName": %s, "ArrivalCityName": %s, "DepartureDate": "2017-03-24", "DepartureDateReturn": "2017-03-26", "ArrivalDate": "", "TrainNumber": ""}'
# Path template with one %s slot.
base_path = 'xc-price/%s'
# Selects up to 100 rows whose ``task`` flag is still 0.
getStations_sql = 'select id,begin_stop,begin_alia,end_stop,end_alia from train_stop_20170331_task_xc where task=0 limit 100'
# Sets the ``task`` flag to 1 for one row id.
update_sql = 'update train_stop_20170331_task_xc set task = 1 where id =%s'
# Shared pinyin converter, loaded from 'word.data' at import time.
py_util = PinYin()
py_util.load_word('word.data')


def get(p):
    """Fetch URL ``p`` and return its body decoded as GB2312.

    Sleeps for one second before every request. Any failure (encoding the
    URL, the request itself, or decoding the body) is printed and reported
    by returning the string ``'500'``.

    Args:
        p: the URL as a unicode string.

    Returns:
        The decoded page content, or ``'500'`` on failure.
    """
    time.sleep(1)
    try:
        response = urllib.urlopen(p.encode('utf-8'))
        try:
            content = response.read()
        finally:
            # Close the connection even when read() fails.
            response.close()
        return content.decode('gb2312')
    except Exception as e:
        print(e)
        # Bug fix: '500' was assigned to a local but never returned, so
        # callers received None on failure.
        return '500'
# -*- coding: utf-8 -*-
# Author: [email protected]
# Copyright 2015 @ NLPJob

# Bug fixed: can now be used in a pyenv environment.
# Update: this is a Python 3 script.

import codecs
import sys

from langconv import *
from pinyin import PinYin
# Module-level pinyin converter, loaded once at import time.
py = PinYin()
py.load_word()


def make_word_4tag(word):
    """Return the position-tag string for ``word``.

    An empty word gives ``"N"`` and a single-element word gives ``"S"``.
    Longer words give ``"B"``, one ``"M"`` per inner element, then ``"E"``.
    """
    size = len(word)
    if size == 0:
        return "N"
    if size == 1:
        return "S"
    return "B" + "M" * (size - 2) + "E"


def make_mecab_train_data(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
Beispiel #19
0
from common import get_response_by_url
from mongoservice import Insert,get_category_by_cid,get_by_pinyin,get_all
from bs4 import BeautifulSoup
from pinyin import PinYin
import  os
# Category id constant (not referenced in the code visible here).
_cid = 160
# Listing URL prefix; the category id is appended to it.
base_url = "http://www.meishij.net/shiliao.php?cid="
# Module-level pinyin converter, loaded once at import time.
s_pinyin = PinYin()
s_pinyin.load_word()

# filepath =os.path.abspath("./1.json")
'''获取 理疗分类'''
def get_meishijie_categories(cid,category_pinyin='',category_cn=''):
    """Fetch the listing page for category ``cid`` and select its links.

    Downloads ``base_url + str(cid)``, parses it with BeautifulSoup and
    selects the ``a`` elements under ``.listnav_dl_style1 .current``.

    NOTE(review): the function body visible here stops after the selector,
    so ``category_pinyin`` and ``category_cn`` are unused and nothing is
    returned. Also, ``get_html_by_url`` is not among the imports shown above
    (``get_response_by_url`` is) -- confirm it is defined elsewhere.
    """
    url=base_url+str(cid)
    html =get_html_by_url(url)
    # print(html)
    # soup = BeautifulSoup(html)
    # print(soup)
    # print(soup.prettify())

    sop = BeautifulSoup(html)
    # h = sop.prettify()
    # print( h )
    # head = sop.find('head')
    # print(head)
    # p_categories = sop.findAll(attrs={'id':'listnav_ul'})[0]
    # print(p_categories)

    # dds = sop.select(".listnav_dl_style1 dd a")
    # Only links inside the element marked ``.current`` are selected.
    dds = sop.select(".listnav_dl_style1 .current a")
Beispiel #20
0
class CDBLPAuthor:
    """Scraper for an author page on cdblp.cn, plus journal-level helpers.

    An instance downloads one author's search-result page on construction.
    ``get_author`` then extracts the co-authors and publications from it.
    The static methods convert Chinese names to pinyin-based English names
    and crawl journal issue pages.
    """

    # Shared pinyin converter, loaded once when the class is defined.
    pinyin = PinYin()
    pinyin.load_word()

    # href patterns used on C-DBLP pages.
    _PAPER_KEY_RE = re.compile(r'(\d+)(\.html$)')
    _VOLUME_RE = re.compile(r'(/journal)/(.*)/(\d*)/(.*)')
    _ISSUE_RE = re.compile(r'(/journal_issue)/(.*)/(\d*)/(.*)')

    def __init__(self, author_name, link=''):
        """Download the author's page.

        Args:
            author_name: the author's Chinese name.
            link: page to download; when empty, the C-DBLP search URL for
                the author is used.
        """
        self.author_name = CDBLPAuthor.getEnglishName(author_name)

        if not link:
            link = 'http://cdblp.cn/search_result.php?author_name={}&area=computer'.format(
                quote(self.author_name['zh']))
        elif author_name == '王伟':
            # NOTE(review): hard-coded local override for this one name;
            # looks like a debugging leftover -- confirm before relying on it.
            link = 'http://127.0.0.1/ww'

        self.res = urlopen(link)
        self.dom = BeautifulSoup(self.res)

        #self.get_all_authors()

        # Placeholder record; replaced by get_author().
        self.author = {
            'author_name': {},
            'coauthors': [],
            'publications': [{
                'title':
                'Ranking the Difficulty Level of the Knowledge Units Based on Learning Dependency',
                'authors':
                ['Jun Liu', 'Sha Sha', 'Qinghua Zheng', 'Wei Zhang'],
                'venue-type':
                'journal',
                'venue':
                'IJDET',
                'volume':
                '',
                'number':
                '',
                'pages':
                '',
                'year':
                '2012',
                'cdblpkey':
                '83594'
            }]
        }

    def get_all_authors(self):
        """Ask the user to pick one of the authors sharing this name.

        Lists every disambiguation link whose text is not ``'Unknown'``,
        reads the chosen number from stdin, reloads ``self.res`` and
        ``self.dom`` from the chosen page and returns its URL.

        Raises:
            ValueError: if the input is not an integer.
            IndexError: if the number is beyond the listed choices.
        """
        links = []

        for name_tag in self.dom.find_all(
                href=re.compile('namedisambiguation')):
            if name_tag.string != 'Unknown':
                # Bug fix: the number shown is now the index into ``links``.
                # It used to count skipped 'Unknown' tags as well, so the
                # user's answer could select a different author.
                print(len(links), self.author_name['zh'], 'from',
                      name_tag.string)
                links.append('http://cdblp.cn' + name_tag['href'][5:])

        c = int(
            input(
                'There are several authors under this name, which one do you want to choose?\n> '
            ))
        if c < 0:
            c = 0

        self.res = urlopen(links[c])
        self.dom = BeautifulSoup(self.res)

        return links[c]

    def get_author(self):
        """Parse the downloaded page into the author record.

        Returns:
            dict with ``'author_name'``, ``'coauthors'`` and
            ``'publications'``; it is also stored in ``self.author``.
        """
        coauthors = self.get_coauthors()
        publications = []

        for paper_link_tag in self.dom.find_all(href=re.compile('^/paper')):
            # table cell tag
            td_tag = paper_link_tag.parent
            authors = []
            author_tags = td_tag.find_all(href=re.compile('^/author'))
            for counter, author_tag in enumerate(author_tags):
                # Plain-text siblings that contain this author's own name are
                # collected as authors in addition to the linked names.
                if counter == 0:
                    before = author_tag.previous_sibling
                    if type(before) is NavigableString and self.author_name[
                            'zh'] in before.string:
                        authors.append(before.string.strip())

                if isinstance(author_tag.string, str):
                    authors.append(author_tag.string.strip())

                after = author_tag.next_sibling
                if type(after) is NavigableString and self.author_name[
                        'zh'] in after.string:
                    authors.append(after.string.replace('.', '').strip())

            publications.append(
                CDBLPAuthor._publication_record(paper_link_tag, authors))

        self.author = {
            'author_name': self.author_name,
            'coauthors': coauthors,
            'publications': publications
        }

        return self.author

    def get_coauthors(self):
        """Return the co-author entries of the second-to-last table.

        Each entry is the co-author's name dict extended with ``'count'``
        (number of shared publications) and ``'pubs'`` (their hrefs without
        the leading character).
        """
        coauthors = []

        coauthor_table = self.dom.find_all('table')[-2]
        for coauthor_tag in coauthor_table.find_all(
                href=re.compile('^/author')):
            coauthored_pub_tags = coauthor_tag.parent.find_next_sibling(
                'td').find_all('a')
            author = CDBLPAuthor.getEnglishName(coauthor_tag.string.strip())
            author['count'] = len(coauthored_pub_tags)
            # A real list: on Python 3 a ``map`` object can be consumed only
            # once and cannot be serialised with json.dumps.
            author['pubs'] = [t['href'][1:] for t in coauthored_pub_tags]
            coauthors.append(author)

        return coauthors

    @staticmethod
    def _publication_record(paper_link_tag, authors):
        """Build one publication dict from a paper link and its author list."""
        td_tag = paper_link_tag.parent
        cdblpkey = CDBLPAuthor._PAPER_KEY_RE.findall(
            paper_link_tag['href'])[0][0]

        # The journal links of the cell are read as: [0] venue, [1] volume,
        # [2] issue; the page range follows the last of them as text.
        venue_rec = td_tag.find_all(href=re.compile('^/journal'))
        volume_result = CDBLPAuthor._VOLUME_RE.findall(
            venue_rec[1]['href'])[0]
        issue_result = CDBLPAuthor._ISSUE_RE.findall(venue_rec[2]['href'])[0]
        pages = venue_rec[-1].next_sibling.string.replace(':', '').strip()

        return {
            'title': paper_link_tag.string,
            'authors': authors,
            'venue-type': 'journal',
            'venue': venue_rec[0].string,
            'volume': unquote(volume_result[-1]),
            'number': unquote(issue_result[-1]),
            'pages': pages,
            'year': volume_result[-2],
            'cdblpkey': cdblpkey
        }

    @staticmethod
    def getEnglishName(author_name_zh):
        """Convert a Chinese name using the class-level pinyin converter.

        Same result as ``get_english_name(author_name_zh, CDBLPAuthor.pinyin)``.
        """
        return CDBLPAuthor.get_english_name(author_name_zh,
                                            CDBLPAuthor.pinyin)

    @staticmethod
    def get_english_name(author_name_zh, py_obj):
        """Convert a Chinese name into its pinyin-based English name parts.

        Args:
            author_name_zh: the Chinese name; its first character is treated
                as the family name.
            py_obj: object providing ``hanzi2pinyin``.

        Returns:
            ``{'full_name': ...}`` when the converter returns a plain string.
            Otherwise a dict with ``'zh'``, ``'last_name'``, ``'first_name'``,
            ``'full_name'`` ("First Last") and ``'full_name_reverse'``, plus
            ``'full_name_dash'`` ("Xx-yy Last") for three-character names.
        """
        parts = py_obj.hanzi2pinyin(author_name_zh.strip())
        # return author's English name
        if isinstance(parts, str):
            return {'full_name': parts}

        if len(author_name_zh) > 1:
            first_name = parts[1].capitalize() + ''.join(parts[2:])
        else:
            first_name = ''

        author_name = {
            'zh': author_name_zh,
            'last_name': parts[0].capitalize(),
            'first_name': first_name
        }
        author_name['full_name'] = '{} {}'.format(
            author_name['first_name'], author_name['last_name'])
        author_name['full_name_reverse'] = '{} {}'.format(
            author_name['last_name'], author_name['first_name'])
        if len(author_name_zh) == 3:
            author_name['full_name_dash'] = '{}-{} {}'.format(
                parts[1].capitalize(), parts[2], author_name['last_name'])

        return author_name

    @staticmethod
    def get_publications_by_journal(journal, year, issue):
        """Return the publication dicts listed on one journal issue page."""
        res = urlopen('http://cdblp.cn/journal_issue/' +
                      quote('{}/{}/{}'.format(journal, year, issue)))
        dom = BeautifulSoup(res)
        publications = []

        for paper_link_tag in dom.find_all(href=re.compile('^/paper')):
            authors = []
            for author_tag in paper_link_tag.parent.find_all(
                    href=re.compile('^/author')):
                author_name = author_tag.contents[0]
                if isinstance(author_name, str):
                    authors.append(author_name.strip())

            publications.append(
                CDBLPAuthor._publication_record(paper_link_tag, authors))

        return publications

    @staticmethod
    def get_publication_dict():
        """Crawl every journal and issue listed on the journal scan page.

        Returns:
            Nested dict ``{journal: {year: {issue: [publication, ...]}}}``.
        """
        publication_dict = {}

        res = urlopen('http://cdblp.cn/jour_scan.php?fid=journalscan')
        category_dom = BeautifulSoup(res)
        journal_tags = category_dom.find_all(href=re.compile('^/journal'))

        print([{
            'title': c.string,
            'href': 'http://cdblp.cn' + c['href']
        } for c in journal_tags])

        for journal_tag in journal_tags:

            journal = journal_tag.string
            print(journal)
            print('http://cdblp.cn' + journal_tag['href'])
            publication_dict[journal] = {}

            res = urlopen('http://cdblp.cn' + journal_tag['href'])
            journal_dom = BeautifulSoup(res)

            for issue_tag in journal_dom.find_all(
                    href=re.compile('^/journal_issue')):
                print(issue_tag.string)
                print(issue_tag['href'])

                issue_result = CDBLPAuthor._ISSUE_RE.findall(
                    issue_tag['href'])[0]
                year = issue_result[-2]
                issue = unquote(issue_result[-1])
                publications = CDBLPAuthor.get_publications_by_journal(
                    journal, year, issue)

                publication_dict[journal].setdefault(year,
                                                     {})[issue] = publications

        return publication_dict

    @staticmethod
    def parallel_get(journal, link):
        """Crawl all issues of one journal and cache the result as JSON.

        Issues that raise AttributeError, TypeError or HTTPError are
        reported on stdout and skipped. The result is written to
        ``<journal>-pub-cache.data``.

        Returns:
            Nested dict ``{year: {issue: [publication, ...]}}``.
        """
        publication_dict = {}

        print(journal)
        print(link)

        res = urlopen(link)
        journal_dom = BeautifulSoup(res)

        for issue_tag in journal_dom.find_all(
                href=re.compile('^/journal_issue')):
            # Reset per issue so the handler below never reads unbound names
            # (first issue) or stale values from the previous issue.
            year = issue = ''
            try:
                issue_result = CDBLPAuthor._ISSUE_RE.findall(
                    issue_tag['href'])[0]
                year = issue_result[-2]
                issue = unquote(issue_result[-1])
                publications = CDBLPAuthor.get_publications_by_journal(
                    journal, year, issue)

                publication_dict.setdefault(year, {})[issue] = publications
            except (AttributeError, TypeError,
                    urllib.error.HTTPError) as err:
                print(journal + year + issue)
                print(err)

        with open('{}-pub-cache.data'.format(journal), 'w') as cache:
            cache.write(json.dumps(publication_dict))

        return publication_dict
Beispiel #21
0
 def __init__(self):
     """Create the instance's pinyin converter and load its dictionary."""
     self.test = PinYin()
     self.test.load_word()
Beispiel #22
0
def get_phonetic(word):
    """Return the pinyin pieces of ``word`` joined into one string.

    NOTE(review): this calls ``hanzipinyin``; the other snippets in this
    file call ``hanzi2pinyin`` -- confirm the method exists on ``PinYin``.
    """
    converter = PinYin()
    converter.load_word()
    pieces = converter.hanzipinyin(word)
    return ''.join(pieces)
Beispiel #23
0
# -*- coding: utf-8 -*-
from pinyin import PinYin

# Demo: print the outputs of PinYin.to_pinyin / PinYin.to_abbr for two sample
# sentences, with and without a separator, capitalisation and ``multi``.
if __name__ == '__main__':

    test = PinYin(no_digit=False, no_letter=False)

    test.load_word()

    s = '钓鱼岛是中国的'

    # Plain conversion, then the join_with / multi / capitalize variants.
    print('"%s" 转换成拼音: %s' % (s, test.to_pinyin(s)))
    print('"%s" 转换成带分隔符的拼音: %s' % (s, test.to_pinyin(s, join_with=' ')))
    print('"%s" 转换成带分隔符的拼音(保留多音字): %s' %
          (s, test.to_pinyin(s, join_with=' ', multi=True)))
    print('"%s" 转换成带分隔符的拼音且首字符大写: %s' %
          (s, test.to_pinyin(s, join_with=' ', capitalize=True)))
    print('"%s" 转换成带分隔符的拼音且首字符大写(保留多音字): %s' %
          (s, test.to_pinyin(s, join_with=' ', capitalize=True, multi=True)))
    # Abbreviation output, without and with ``multi``.
    print('"%s" 转换成首字母缩写: %s' % (s, test.to_abbr(s)))
    print('"%s" 转换成首字母缩写(保留多音字): %s' % (s, test.to_abbr(s, multi=True)))

    # Second sample: a sentence containing full-width punctuation.
    s = '加油中国!加油华为!你行!'

    print('\n\n原始字符串:"%s"' % s)
    # Original string: "加油中国!加油华为!你行!"

    print('拼音首字母: %s ' % test.to_abbr(s))

    print('全拼: %s' % test.to_pinyin(s))
Beispiel #24
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from pinyin import PinYin
import sys

# Command-line helper: convert the first command-line argument with
# hanzi2pinyin_split(), using '_' as the split string, and print the result.
if len(sys.argv) < 2:
    # Fail with a readable message (exit status 1) instead of an IndexError
    # traceback when the argument is missing.
    sys.exit('usage: %s <string>' % sys.argv[0])

test = PinYin('application/libraries/pinyin.py/word.data')
test.load_word()

# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(test.hanzi2pinyin_split(string=sys.argv[1], split='_'))
Beispiel #25
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import re
import xlwt
from pinyin import PinYin, Cartesian_product

# Module-level converter; load_word() runs once here, at import time.
test = PinYin()
test.load_word()

# Workbook shared by the script; main() saves it after all input files
# have been passed to func().
wbk = xlwt.Workbook()


def main():
    """Pass each input CSV file name to ``func``, then save the workbook.

    The workbook ``wbk`` is written once, after every file has been handled,
    to a fixed path under /home/chenyp/sharefolder.
    """
    for csv_name in ('shop_2.csv', 'sight_2.csv', 'district_2_n.csv'):
        func(csv_name)
    wbk.save("/home/chenyp/sharefolder/cn2pinyin.xls")


def func2(filename):
    #餐馆的输入文档
    poitype = filename.split(".")[0].decode('utf-8')
    column2 = u"%sid" % poitype
    count = 0
    lines = open(filename).readlines()[1:100]
    MAX = 35000
    if len(lines) % MAX == 0:
        total = len(lines) / MAX
    else:
Beispiel #26
0
    u"天气预报",
    u"京东",
    u"淘宝",
    u"百度",
    u"微信",
    u"斗鱼",
    u"爱奇艺",
    u"腾讯视频",
    u"qq",
    u"熊猫tv",
    u"快递",
    u'4399',
}


# Module-level converter used by hanzi2pinyi(); load_word() runs once here,
# at import time.
word2pinyin = PinYin()
word2pinyin.load_word()
# Lookup table with one entry (value 1) per lowercase ASCII letter, 'a'..'z'.
alphabet = {letter: 1 for letter in 'abcdefghijklmnopqrstuvwxyz'}


def hanzi2pinyi(word):
    """Convert *word* character by character and join the pieces.

    A character whose lowercase form is a key of ``alphabet`` is emitted
    lowercased; every other character is passed to
    ``word2pinyin.hanzi2pinyin`` and that result is used instead. The pieces
    are concatenated without a separator.
    """
    def convert(char):
        lowered = char.lower()
        if lowered in alphabet:
            return lowered
        return word2pinyin.hanzi2pinyin(char)

    return ''.join([convert(char) for char in word])
Beispiel #27
0
#!/usr/bin/env python
# coding=utf-8
import os
import sys
import numpy as np
import pandas as pd
import jieba
import re

sys.path.append('utils/')
import config
from pinyin import PinYin
# Module-level converter (note: load_word() is not called on it here).
str2pinyin = PinYin()
jieba.load_userdict(config.jieba_dict)

# Read the configured stop-word file inside a ``with`` block so the file
# handle is closed as soon as the lines are read, instead of being leaked.
with open(config.stopwords_path, 'r') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file.readlines()]
stopwords = [w.decode('utf8') for w in stopwords]
# NOTE: the list loaded from the file is replaced unconditionally by the
# fixed punctuation list below (a `config.cut_char_level` guard used to be
# here but is disabled). The file is still read, so a missing or unreadable
# stop-word file keeps failing at import time exactly as before.
stopwords = [
    u'?',
    u'。',
    u',',
]

use_pinyin = False


def clean_str(x):
    punc = "蚂蚁  了 吗  的 !?。,:;."
Beispiel #28
0
import config
from urllib import quote
from pinyin import PinYin
import magic
import shutil
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again
# (site.py removes it at startup) so the implicit str/unicode codec can be
# switched to UTF-8.
reload(sys)
sys.setdefaultencoding( "utf-8" )

# The uncompression module is imported only when enabled in config.
if config.uncompression_enable:
    import uncompression

# load config file
root = config.root
host = config.host
# The dictionary file path is built from the current working directory,
# not from this file's location.
py = PinYin(dict_file=os.getcwd()+'/pinyin/word.data')
py.load_word()

# File extensions known to the application. The order is unchanged; the
# entries are only laid out by rough category for readability.
types = [
    # source code / markup
    ".h", ".cpp", ".cxx", ".cc", ".c", ".cs", ".html", ".js",
    ".php", ".java", ".py", ".rb", ".as",
    # images / design files
    ".jpeg", ".jpg", ".png", ".gif", ".ai", ".psd",
    # audio / video
    ".mp3", ".avi", ".rmvb", ".mp4", ".wmv", ".mkv",
    # office documents
    ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
    # archives
    ".zip", ".tar", ".gz", ".7z", ".rar",
    # everything else
    ".pdf", ".txt", ".exe", ".apk", ".torrent", ".srt", ".pyc",
]

# Starts out empty.
# NOTE(review): presumably filled or consulted elsewhere for previewable
# items - confirm against the rest of the file.
preview=[]

# Template renderer created for the 'template' directory.
render = web.template.render('template')
Beispiel #29
0
# -*- coding: utf-8 -*-
import re

from openerp import models, fields, api
from openerp.osv.expression import get_unaccent_wrapper, NEGATIVE_TERM_OPERATORS

from pinyin import PinYin
# Module-level converter used by comman_change_name(); load_word() runs once
# here, at import time.
han2py = PinYin()
han2py.load_word()

# class multiple_name_search(models.Model):
#     _name = 'multiple_name_search.multiple_name_search'

#     name = fields.Char()


# Shared helper that converts a name to pinyin
def comman_change_name(name):
    """Return the full pinyin and the first-letter abbreviation of *name*.

    Args:
        name: The text to convert. Any falsy value (``None``, ``''``,
            ``False``) means "no name".

    Returns:
        A dict with two keys:
        ``'pinyin'`` - all pieces returned by ``han2py.str2pinyin(name)``
        joined without a separator;
        ``'py'`` - the first character of each piece, joined the same way.
        Both values are ``False`` when *name* is falsy.
    """
    # No name given: keep the ``False`` placeholders callers already expect.
    if not name:
        return {'pinyin': False, 'py': False}

    syllables = han2py.str2pinyin(name)
    # ``s[:1]`` instead of ``s[0]`` so an empty piece contributes nothing
    # rather than raising IndexError.
    initials = ''.join([s[:1] for s in syllables])
    full = ''.join(syllables)
    return {'pinyin': full, 'py': initials}


class WithPinyinProductTemplate(models.Model):
    """Extension of the existing ``product.template`` model (via ``_inherit``)."""
    _inherit = 'product.template'
def check_contain_english(check_str):
    """Return True if *check_str* has any character outside U+4E00..U+9FFF.

    Despite the name, this does not look for English specifically: every
    character outside the CJK Unified Ideographs block (letters, digits,
    punctuation, whitespace, ...) counts.

    Args:
        check_str: UTF-8 encoded bytes (a Python 2 ``str``) or already
            decoded text. Bytes are decoded as UTF-8 first.

    Returns:
        ``True`` if at least one character lies outside the inclusive range
        U+4E00..U+9FFF, ``False`` otherwise (including for empty input).

    Raises:
        UnicodeDecodeError: if *check_str* is bytes that are not valid UTF-8.
    """
    if isinstance(check_str, bytes):
        check_str = check_str.decode('utf-8')
    # Both ends of the block are themselves valid ideographs (U+4E00 is the
    # common character for "one"), so the comparisons must be strict.
    return any(ch < u'\u4e00' or ch > u'\u9fff' for ch in check_str)


# Load the raw city data from the JSON file.
with open('ChinaCityList.json') as json_data:
    d = json.load(json_data)
# Flatten the nested city -> county records into a pandas DataFrame and
# extract its `name` column as a list.
city = json_normalize(data=d, record_path=['city', 'county'])
city_name = city.name.tolist()
# Encode every name as UTF-8 so the keys below are byte strings.
city_name = [x.encode('utf-8') for x in city_name]
# Build a dictionary in the form of city: pinyin
trans = PinYin()
trans.load_word()
# Module-level shortcut for the converter method.
to_py = trans.hanzi2pinyin
city_py = [to_py(x) for x in city_name]
city_dict = dict(zip(city_name, city_py))


# city chain game
def city_chain(city):
    if len(city) == 0:
        print '错误:请确认是否输入汉字'
    elif check_contain_english(city):
        print '错误:请确认是否输入了非汉字'
    else:
        candidate = []
        py_city = to_py(city)