Example #1
    def populate_tr_fields(self):
        import opencc

        self.name_tr = opencc.convert(self.name or "", config='s2t.json')
        self.full_name_tr = opencc.convert(self.full_name or "",
                                           config='s2t.json')
        self.desc_tr = opencc.convert(self.desc or "", config='s2t.json')
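A note on the API used throughout these examples: the module-level opencc.convert(text, config=...) call comes from older Python bindings of OpenCC. Newer releases of the Python OpenCC package expose a converter class instead, so a rough equivalent of the snippet above might look like the sketch below (an assumption-based sketch: it presumes the PyPI OpenCC package is installed; the field names are simply reused from the example).

    def populate_tr_fields(self):
        from opencc import OpenCC

        # Build one Simplified-to-Traditional converter and reuse it.
        cc = OpenCC('s2t')
        self.name_tr = cc.convert(self.name or "")
        self.full_name_tr = cc.convert(self.full_name or "")
        self.desc_tr = cc.convert(self.desc or "")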
Example #2
    def adapt(cls, site, channel, doc, mapping, region):
        """ 适配 mongodb 里的数据格式到线上存储接口支持的格式 """
        form = channel["form"]
        caller = cls.__dict__[cls._map[form]].__func__
        data = caller(cls, site, channel, doc, mapping)

        # For the foreign-media (29) and oddities (31) channels, convert title and content from Traditional to Simplified Chinese
        if mapping["first_cid"] in ["29", 29, "31", 31]:
            import opencc
            data["title"] = opencc.convert(data["title"])
            for i, item in enumerate(data["content"]):
                if "txt" in item:
                    data["content"][i]["txt"] = opencc.convert(
                        data["content"][i]["txt"])
        data["title"] = cls.normalize_unicode(data["title"])  # 归一化一些字符
        # 统一一些字段
        data["unique_id"] = "%s_%s" % (form, doc["request"])  # docid
        data["publish_site"] = doc["publish_ori_name"] or site["name"]  # pname
        pt = format_datetime_string(doc["publish_time"], g=True)
        data["publish_time"] = pt[:10] + "T" + pt[11:] + "Z"  # ptime
        data["insert_time"] = datetime.now().isoformat()[:-7] + "Z"
        data["site_icon"] = doc["publish_ori_icon"]
        data["channel_id"] = mapping["first_cid"]
        if mapping["second_cid"]:  # 如果有线上二级频道信息,则上传
            data["second_channel_id"] = mapping["second_cid"]
        if doc.get("tags"):
            data["tags"] = cls.split_tag_words(doc["tags"])
        elif form == "news":  # Fixme: hard-coded type check
            data["tags"] = cls.generate_tags_for_game(
                doc["title"], mapping["first_cid"],
                channel["name"])  # Fixme: 游戏频道需要根据 title 生成 tags
        else:
            data["tags"] = list()
        if form != "video":  # compute image number uniformly
            data["image_number"] = sum(
                [1 for item in data["content"] if "img" in item])
        data["online"] = True
        if region:  # geographic location info
            if region["province"]:
                data["province"] = region["province"]
            if region["city"]:
                data["city"] = region["city"]
            if region["county"]:
                data["district"] = region["county"]
        # Fixme: kept so the old API version can fetch data by online_source_id; not needed in the new version
        if mapping.get("online_source_sid"):
            data["source_id"] = mapping["online_source_sid"]

        # Add: new field so online data can be traced back to its crawl source
        data["spider_source_id"] = str(channel["_id"])
        return data
Example #3
def parseGos(link , g_id):
    resp = requests.get(url=str(link),cookies={"over18":"1"})
    soup = BeautifulSoup(resp.text)
    print(resp)
    # author
    author  = soup.find(id="main-container").contents[1].contents[0].contents[1].string.replace(' ', '')
    author = opencc.convert(author)
    # title
    title = soup.find(id="main-container").contents[1].contents[2].contents[1].string.replace(' ', '')
    title = opencc.convert(title)
    # date
    date = soup.find(id="main-container").contents[1].contents[3].contents[1].string
    # ip
    try:
        ip = soup.find(text=re.compile("※ 發信站:"))
        ip = re.search("[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*",str(ip)).group()
    except:
        ip = "ip is not find"
    # content
    a = str(soup.find(id="main-container").contents[1])
    a = a.split("</div>")
    a = a[4].split("<span class=\"f2\">※ 發信站: 批踢踢實業坊(ptt.cc),")
    content = a[0].replace(' ', '').replace('\n', '').replace('\t', '')
    content = re.sub( '<([^>]*)>[^<]*<[^>]*>', '', content)
    content = re.sub( '<([^>]*)>', '', content)
    content = re.sub( '[A-Za-z0-9]*', '', content)
    content = re.sub( '\.', '', content)
    content = opencc.convert(content)
    # message
    num , all , g , b , n ,message = 0,0,0,0,0,{}
    for tag in soup.find_all("div","push"):
        num += 1
        push_tag = tag.find("span","push-tag").string.replace(' ', '')
        push_userid = tag.find("span","push-userid").string.replace(' ', '')
        push_content = tag.find("span","push-content").string.replace(' ', '').replace('\n', '').replace('\t', '').replace(':', '')
        push_content = opencc.convert(push_content)
        push_ipdatetime = tag.find("span","push-ipdatetime").string.replace('\n', '')

        message[num]={"狀態":push_tag,"留言者":push_userid,"留言內容":push_content,"留言時間":push_ipdatetime}
        if push_tag == '推':
            g += 1
        elif push_tag == '噓':
            b += 1
        else:
            n += 1
    messageNum = {"g":g,"b":b,"n":n,"all":num}
    # json-data
    d={ "a_ID":g_id , "b_作者":author , "c_標題":title , "d_日期":date , "e_ip":ip , "f_內文":content , "g_推文":message, "h_推文總數":messageNum }
    json_data = json.dumps(d,ensure_ascii=False,indent=4,sort_keys=True)+','

    store(json_data)
Example #4
    def convert_tree(subs):
        for sub in subs:

            if sub.title is not None:
                sub.title = convert(sub.title)

            if sub.sec_title is not None:
                sub.sec_title = convert(sub.sec_title)

            if isinstance(sub, Sutra):
                sub.main_lines = [convert(line) for line in sub.main_lines]

            else:
                convert_tree(sub.subs)
Example #5
    def getMetadata(cls, audioId):

        opencc = cls._opencc()
        youtube = APIService._youtube(authenticate=False)
        youtubeData = youtube.videos().list(id=audioId, part="snippet").execute()["items"][0]["snippet"]
        shikData = requests.get(cls.SHIK_API_URL, params={'youtube_id' : audioId}).json()

        artists = []
        if shikData["artist"] is not None:
            artists.append(opencc.convert(shikData["artist"].encode('utf-8')))
        return {
            "artist": artists,
            "thumbnail": youtubeData["thumbnails"]["default"]["url"].encode('utf-8'),
            "title": opencc.convert(youtubeData["title"].encode('utf-8'))
        }
Example #6
def proprocess_LDC2005T10(data_path, outpath):
  from bs4 import BeautifulSoup  
  import opencc
  chinese_path = os.path.join(data_path, "data/Chinese")
  english_path = os.path.join(data_path, "data/English")
  alignment_path = os.path.join(data_path, "data/alignment")
  # chinese_files = os.listdir(chinese_path)
  # english_files = os.listdir(english_path)
  alignment_files = os.listdir(alignment_path)
  en_outfile = open(os.path.join(outpath, "nmpt.en"),'w')
  ch_outfile = open(os.path.join(outpath, "nmpt.zh"),'w')
  c_count_line = 0
  e_count_line = 0
  for afile in alignment_files:
    print os.path.join(alignment_path, afile)
    alignmet_f = open (os.path.join(alignment_path, afile))
    a_soup = BeautifulSoup(alignmet_f.read())
    chinese_f = open (os.path.join(chinese_path, afile),'r')
    c_soup = BeautifulSoup(chinese_f.read(), fromEncoding="CP950")
    english_f = open (os.path.join(english_path, afile),'r')
    e_soup = BeautifulSoup(english_f.read())

    for alig in a_soup.find_all('alignment'): 
      # print alig
      # print alig['docid'], type(alig['docid'])
      for sentpair in alig.find_all('sentpair'):
        # print sentpair
        if sentpair['chinesesegid'] == "" or sentpair["englishsegid"] == "":continue
        c1 = c_soup.find_all('doc', attrs={"docid":alig['docid']})
        e1 = e_soup.find_all('doc', attrs={"docid":alig['docid']})
        if len(c1) < 1 or len(e1) < 1:continue
        state = 0
        for cid in sentpair['chinesesegid'].split(','):
          # print "zh", cid
          c2 = c1[0].find_all('seg', attrs={'id':cid})
          if len(c2) < 1:
            state = 1
            break
          line = unicode(c2[0].string).encode('utf-8').strip()
          line = opencc.convert(line).encode('utf8')
          ch_outfile.write(line+" ")
        ch_outfile.write('\n')
        c_count_line +=1
        for eid in sentpair['englishsegid'].split(','):
          if state == 1:
            state = 0
            break
          e2 = e1[0].find_all('seg', attrs={'id':eid})
          if len(e2) < 1:
            break
          line = unicode(e2[0].string).encode('utf-8').strip()
          en_outfile.write(line+" ")
        en_outfile.write('\n')
        e_count_line += 1
    alignmet_f.close()
    chinese_f.close()
    english_f.close()
  en_outfile.close()
  ch_outfile.close()
  print c_count_line, e_count_line
Example #7
def process(corpus_path=CORPUS_ROOT, out_dict_path=OUTPUT_DICT):

    full_dict = []

    for root, subFolders, files in os.walk(corpus_path):
        for name in files:
            if file_is_valid(root, name):
                with open(os.path.join(root, name), mode='r',
                          encoding='utf-8') as corpus_file:
                    try:
                        content = corpus_file.read()
                        content_cn = opencc.convert(content, config='t2s.json')
                        corpus_file.close()
                        lines = content.split()
                        lines_cn = content_cn.split()
                        for line in lines:
                            for char in line:
                                if char not in full_dict:
                                    full_dict.append(char)
                        for line in lines_cn:
                            for char in line:
                                if char not in full_dict:
                                    full_dict.append(char)
                    except Exception as e:
                        traceback.print_exc()
                        print(
                            'Something not very nice happened with {}; skipping file.'
                            .format(os.path.join(root, name)))

    if '\n' in full_dict:
        full_dict.remove('\n')
    full_dict.sort()

    with open(out_dict_path, mode='w', encoding='utf-8') as cn_cdict_file:
        cn_cdict_file.writelines('\n'.join(full_dict))
Example #8
def translate_to_zh_cn(nikaya_book):
    """
    :param nikaya_book:
     :type nikaya_book: Nikaya
    :return:
    """

    nikaya = copy.deepcopy(nikaya_book)

    nikaya.title_chinese = convert(nikaya.title_chinese)
    nikaya.languages.append('zh-cn')

    def convert_tree(subs):
        for sub in subs:

            if sub.title is not None:
                sub.title = convert(sub.title)

            if sub.sec_title is not None:
                sub.sec_title = convert(sub.sec_title)

            if isinstance(sub, Sutra):
                sub.main_lines = [convert(line) for line in sub.main_lines]

            else:
                convert_tree(sub.subs)

    convert_tree(nikaya.subs)

    return nikaya
Example #10
def t2t(text: str, normal: bool, printable: bool, pure: bool = False) -> str:
    # Convert the string, text to text
    try:
        if not text:
            return ""

        if normal:
            for special in ["spc", "spe"]:
                text = "".join(
                    eval(f"glovar.{special}_dict").get(t, t) for t in text)

            text = normalize("NFKC", text)

        if printable:
            text = "".join(t for t in text
                           if t.isprintable() or t in {"\n", "\r", "\t"})

        if normal and glovar.zh_cn:
            text = convert(text, config="t2s.json")

        if pure:
            text = sub(r"""[^\da-zA-Z一-龥.,:'"?!~;()。,?!~@“”]""", "", text)
    except Exception as e:
        logger.warning(f"T2T error: {e}", exc_info=True)

    return text
Example #11
def predict(content):
    instance = {}
    x = []

    featureList = []

    with open('feature.pkl', 'rb') as f0:
        featureList = pickle.load(f0)

    for ele in featureList:
        instance[ele] = 0

    content = opencc.convert(content)

    for ele in featureList:
        if ele in content:
            instance[ele] += 1

    term = []
    x = []
    for ele in featureList:
        term.append(instance[ele])
    x.append(term)

    X = np.array(x)

    with open('lsa.pkl', 'rb') as f1:
        lsa = pickle.load(f1)
    X = lsa.transform(X)

    with open('model.pkl', 'rb') as f2:
        clf = pickle.load(f2)

    yPred = clf.predict(X)
    return yPred[0]
Example #12
  def cloneWithSimplified(cls, **kwargs):
    """
    XXX(Yorkie): invalid fields cannot be filtered out yet; support
    will be added later.

    Description:
    XX.cloneWithSimplified(title=1, name=2)
    If XX does not define the `title` field, the program currently
    breaks; once field filtering is supported, this error will be ignored.
    """
    fields = { 'defaults':{} }
    for name, value in kwargs.items():
      val = value
      if type(value) == str or type(value) == unicode:
        val = convert(value, config='t2s')
      try:
        cls.update_keys.index(name)
        fields[name] = val
      except ValueError:
        fields['defaults'][name] = val
      except AttributeError:
        fields[name] = val
    fields['lang'] = 2
    del fields['defaults']['id']
    obj, isNew = cls.objects.get_or_create(**fields)
    
    for name, value in kwargs.items():
      item = getattr(obj, name)
      if isinstance(item, Model):
        setattr(obj, name, item.getSimplifiedObject())
    return (obj, isNew)
Example #13
def simple_preprocess(text, *maps, **ops):
    ''' Simple preprocess.

    Args:
      text: unicode string to process
      maps: conversion maps
      ops: operations to do. Supported: trim_space, t2s, full2half, lower.

    Returns:
      processed string.
    '''
    for m in maps:
        for fr, to in iteritems(m):
            text = re.sub(fr, to, text)
    if not text:
        return text
    if ops.get('trim_space', False):
        text = re.sub(u'\s{2,}', ' ', text)
    if ops.get('t2s', False):
        import opencc
        text = opencc.convert(text)
    if ops.get('full2half', False):
        text = str_full2half(text)
    if ops.get('lower', False):
        text = text.lower()
    return text
Example #14
 def before_search(self, search_params):
     if search_params.has_key('q'):
         if 'owner_org:' not in search_params['q']:
             q = search_params['q']
             q = opencc.convert(q, config='zhtw2zhcn_s.ini')
             search_params['q'] = u" ".join(jieba.cut(q))
             # print search_params['q']
     return search_params
Example #15
 def getSimplifiedObject(self):
   """
   if you don't have `title` field or don't want to let this field
   `title` be the key of language exchange, you should overwrite this
   method
   """
   simplifiedTitle = convert(self.title, config='t2s')
   return self.__class__.objects.filter(lang=2, title=simplifiedTitle)[0]
Example #16
 def before_search(self, search_params):
     if search_params.has_key('q'):
         if 'owner_org:' not in search_params['q']:
             q = search_params['q']
             q = opencc.convert(q, config='zhtw2zhcn_s.ini')
             search_params['q'] = u" ".join(jieba.cut(q))
             # print search_params['q']
     return search_params
Example #17
 def convert(self, text):
     """
     No conversion, convert to Simplified, or convert to Traditional.
     Very slow; not recommended.
     """
     if self.convert_config is None:
         return text
     return opencc.convert(text, config=self.convert_config)
Example #18
def simplify(trad_string):
    """Convert Taiwanese Traditional Chinese to Simplified Chinese.

    Converts a string in (Taiwan) traditional Chinese into simplified
    Chinese using OpenCC.
    """
    simplified = opencc.convert(trad_string, config="tw2s.json")
    return (simplified)
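A minimal usage sketch for the function above (assuming the legacy module-level opencc.convert API it relies on is available; under the tw2s profile, 漢 is expected to map to 汉):

if __name__ == "__main__":
    # Quick sanity check of simplify(); output should be "汉字".
    print(simplify("漢字"))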
Example #19
  def save_model(self, request, obj, form, change):
    cur_time = gmtime()
    model = self.model
    lang = form.cleaned_data.get('lang')
    fields = {}

    for field in self._baseFields:
      val = form.cleaned_data.get(field.name)
      fields[field.name] = val

    if not lang:
      lang = self.__lang__(form)
    obj.lang = fields['lang'] = lang

    if self._hasModifyDate:
      fields['modify_date'] = strftime("%Y-%m-%d %H:%M:%S", cur_time)
    if self._hasCreateDate:
      fields['create_date'] = strftime("%Y-%m-%d %H:%M:%S", cur_time)

    if is_traditional(lang):
      """
      convert value to simplifiedObj first
      """
      simplifiedObj, isNew = model.cloneWithSimplified(**fields)
      if (not (self._hasModifyDate and
        not isNew and simplifiedObj.modify_date > obj.modify_date)):
        for field in self._richTextFields:
          text = form.cleaned_data.get(field.name)
          setattr(simplifiedObj, field.name, convert(text))
          fields[field.name] = text
        simplifiedObj.save()

      """
      convert value to traditional
      """
      for name, value in fields.items():
        if type(value) == str or type(value) == unicode:
          setattr(obj, name, convert(value, config='s2t'))

    if self._hasModifyDate:
      obj.modify_date = strftime("%Y-%m-%d %H:%M:%S", cur_time)
    if self._hasCreateDate and not obj.create_date:
      obj.create_date = strftime("%Y-%m-%d %H:%M:%S", cur_time)

    return super(ModelAdmin, self).save_model(request, obj, form, change)
Example #20
 def chatbot_tallk(message_text):
     #message_text = 'hi'
     chatbot = ChatBot('Ron Obvious',trainer = 'chatterbot.trainers.ChatterBotCorpusTrainer')
     return_text = chatbot.get_response(message_text)
     return_text = str(return_text)
     
     return_text2 = opencc.convert(return_text, config='mix2zht.ini')
 
     return return_text2
Example #21
def t2s(input_path, output_path):
    s_lines = ''
    with open(input_path, 'r') as f:
        lines = f.readlines()
    for line in lines:
        s_line = opencc.convert(line)
        s_lines += s_line.encode('utf-8')
    with open(output_path, 'w') as f:
        f.write(s_lines[:-1])
Example #22
def opencc_view(request, id):
    url = 'http://scp-wiki-cn.wikidot.com/scp-{}'.format(id)
    res = requests.get(url)

    if res.status_code != 200:
        return

    html = opencc.convert(res.text, config='s2tw.json')

    return HttpResponse(html)
Example #23
 def cut(self, sentence, pos=True):
     simplified = opencc.convert(sentence, config='tw2s.json')
     tokenized = self.tokenizer(simplified, pos=pos)
     recovered = []
     head = 0
     for tok in tokenized:
         l = len(tok.word)
         recovered.append(Word(sentence[head:head + l], pos=tok.pos))
         head += l
     return recovered
Example #24
  def rank(self, word):
    # returns None if word rank is unknown
    if word in self.rank_map:
      return self.rank_map[word]

    simp_word = opencc.convert(word)
    if simp_word in self.rank_map:
      return self.rank_map[simp_word]

    return None
Example #25
    def convert_to_simple_chinese(self, sentence):
        '''
            Convert a sentence from Traditional to Simplified Chinese.

        :param sentence: the original sentence
        :type sentence: str
        :return: the sentence in Simplified Chinese
        :rtype: str
        '''
        simple_chinese = opencc.convert(sentence, config='zht2zhs.ini')
        return simple_chinese
Example #26
    def convert_to_simple_chinese(self, sentence):
        '''
            Convert a sentence from Traditional to Simplified Chinese.

        :param sentence: the original sentence
        :type sentence: str
        :return: the sentence in Simplified Chinese
        :rtype: str
        '''
        simple_chinese = opencc.convert(sentence, config='zht2zhs.ini')
        return simple_chinese
Example #27
def process_chinese_transformation(file_input, file_output, mode='t2s'):
    with open(file_input, 'r') as f_in, open(file_output, 'w') as f_out:
        config_mode = mode + '.json'
        num_total = 0
        for num, line in enumerate(f_in):
            f_out.writelines([
                opencc.convert(line, config=config_mode)])
            num_total = num + 1
            if num_total % 10000 == 0:
                logger.info('Converted %s lines' % num_total)
        logger.info('Finished, Converted %s lines' % num_total)
Example #28
def wiki_replace(d):
    s = d[1]
    s = re.sub(':*{\|[\s\S]*?\|}', '', s)
    s = re.sub('<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub('(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub('\* *\n|\'{2,}', '', s)
    s = re.sub('\n+', '\n', s)
    s = re.sub('\n[:;]|\n +', '\n', s)
    s = re.sub('\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    return opencc.convert(s).strip()
Example #29
    def populate_tr_fields(self):
        import opencc

        self.title_tr = opencc.convert(self.title or "", config='s2t.json')
        self.title_suffix_tr = opencc.convert(self.title_suffix or "", config='s2t.json')
        self.foreword_tr = opencc.convert(self.foreword or "", config='s2t.json')
        self.content_tr = opencc.convert(self.content or "", config='s2t.json')
        self.intro_tr = opencc.convert(self.intro or "", config='s2t.json')
        self.mobile_title_tr = opencc.convert(self.mobile_title or "", config='s2t.json')
        self.mobile_content_tr = opencc.convert(self.mobile_content or "", config='s2t.json')
Example #30
def main():
    ''''''
    ss = Search()
    title = '成唯识论'
    import opencc
    title = opencc.convert(title, config='s2t.json')
    s = time.time()
    ss.search(title)
    e = time.time()
    print(e-s)
    for idx in ss.search(title):
        print(idx, ss.titles[idx])
Example #31
def convert_tran_sim_save(path, deputy):
    with open(path + deputy, "r") as f:
        lines = f.readlines()

    lines_s = []
    lines_t = []
    for i, line in enumerate(lines):
        line = convert_to_unicode(line)
        lines_s.append(opencc.convert(line, config="t2s.json"))
        lines_t.append(opencc.convert(line, config="s2t.json"))
        print(i)
        print(line)
        print(lines_s[i])
        print(lines_t[i])

    with open(path + "_s" + deputy, "w") as f:
        f.writelines(lines_s)
    with open(path + "_t" + deputy, "w") as f:
        f.writelines(lines_t)

    print(path + deputy, "ok")
Example #32
def wiki_replace(d):
    # WikiExtractor deletes text wrapped in markup like {{}}, and wikicorpus deletes all punctuation, so this function preserves the punctuation and the words inside {{}}
    s = d[1]
    s = re.sub(':*{\|[\s\S]*?\|}', '', s)
    s = re.sub('<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub('(.){{([^{}\n]*?\|[^{}\n]*?)}}', '\\1[[\\2]]', s)
    s = filter_wiki(s)
    s = re.sub('\* *\n|\'{2,}', '', s)
    s = re.sub('\n+', '\n', s)
    s = re.sub('\n[:;]|\n +', '\n', s)
    s = re.sub('\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    return opencc.convert(s).strip()
Example #33
def process_zhwiki(min_content_length=10):
    jieba.load_userdict('../var/tw-dict.dict')
    i = 0
    with open('../var/zhwiki_text.txt', 'w') as f_out:
        for title, content, pageid in gensim.corpora.wikicorpus.extract_pages(
                bz2.BZ2File('../var/zhwiki-20150901-pages-articles.xml.bz2'), filter_namespaces=('0',)):
            content = gensim.corpora.wikicorpus.filter_wiki(content)
            content = opencc.convert(content, 'zhs2zhtw_p.ini')
            content = keep_acceptable_chars(content)
            content = regularize_content(content)
            if len(content) >= min_content_length:
                try:
                    tag = opencc.convert(title, 'zhs2zhtw_p.ini')
                    content = ' '.join([t for t in jieba.cut(content, cut_all=False) if t != ' '])
                    f_out.write('%s ::: %s\n' % (tag.encode('utf-8'), content.encode('utf-8')))
                    i += 1
                except:
                    pass
                if i % 1000 == 0:
                    logger.info("Saved %i articles" % (i))

    logger.info("Totally saved %i articles" % (i))
Example #34
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

  tf.logging.info("*** Reading from input files ***")
  for input_file in input_files:
    tf.logging.info("  %s", input_file)
  
  lines = []   # unconverted
  lines_s = [] # Simplified
  lines_t = [] # Traditional
  for input_file in input_files:
    with tf.gfile.GFile(input_file, "r") as reader:
      while True:
        line = convert_to_unicode(reader.readline())
        if not line:
          break
        if is_non_content(line):
          continue
        # line = extract_chinese(line)
        lines.append(line)
        lines_s.append(opencc.convert(line, config="t2s.json"))
        lines_t.append(opencc.convert(line, config="s2t.json"))
        
  output_files = FLAGS.output_file.split(",")
  tf.logging.info("*** Writing to output files ***")
  for output_file in output_files:
    tf.logging.info("  %s", output_file)

  with tf.gfile.GFile(output_file, "w") as f:
      f.write(''.join(lines))
  with tf.gfile.GFile(output_file + "_s", "w") as f:
      f.write(''.join(lines_s))
  with tf.gfile.GFile(output_file + "_t", "w") as f:
      f.write(''.join(lines_t))
Example #35
def index(request):
    results = None
    msg = None
    form = None
    analogy_examples = [[u'台灣', u'台北', u'法國'],
                        [u'國民黨', u'馬英九', u'民進黨'],
                        [u'海賊王', u'魯夫', u'火影忍者'],
                        [u'爵士樂', u'紐奧良', u'鄉村音樂'],
                        [u'中研院', u'李遠哲', u'工研院'],
                        [u'台灣', u'台灣大學', u'美國'],
                       ]
    if request.method == 'GET':
        form = PhraseAnologyQueryForm(request.GET)
        if request.GET:
            model = gensim.models.Word2Vec.load('../../var/zhwiki.model')
            phrase1 = opencc.convert(request.GET['phrase1'], 'zhs2zhtw_p.ini').strip()
            phrase2 = opencc.convert(request.GET['phrase2'], 'zhs2zhtw_p.ini').strip()
            phrase3 = opencc.convert(request.GET['phrase3'], 'zhs2zhtw_p.ini').strip()
            try:
                results = model.most_similar_cosmul(positive=[phrase3, phrase2], negative=[phrase1])
            except KeyError:
                if phrase1 not in model.vocab:
                    not_exist_phrase = phrase1
                elif phrase2 not in model.vocab:
                    not_exist_phrase = phrase2
                else:
                    not_exist_phrase = phrase3
                msg = u'Word "%s" is not in vocabulary' % (not_exist_phrase)

    vars = {
        'form': form,
        'results': results,
        'msg': msg,
        'analogy_examples': analogy_examples,
    }
    return render_to_response('analogy/index.djhtml', vars)
Example #36
    def translate_subtitles(self):
        subtitles = self.extract_subtitles()
        for index, subtitle in enumerate(subtitles):
            content = subtitle.content

            locale = subtitle.filename.split(".")[-2]
            if re.match(r"chs", locale):
                content = content.decode("gbk")
                content = opencc.convert(content, config="s2t.json")
            else:
                content = content.decode("big5")

            subtitles[index] = SubtitleFile(filename=subtitle.filename,
                                            content=content)
        return subtitles
Example #37
    def populate_tr_fields(self):
        import opencc

        self.title_tr = opencc.convert(self.title or "", config='s2t.json')
        self.title_suffix_tr = opencc.convert(self.title_suffix or "",
                                              config='s2t.json')
        self.foreword_tr = opencc.convert(self.foreword or "",
                                          config='s2t.json')
        self.content_tr = opencc.convert(self.content or "", config='s2t.json')
        self.intro_tr = opencc.convert(self.intro or "", config='s2t.json')
        self.mobile_title_tr = opencc.convert(self.mobile_title or "",
                                              config='s2t.json')
        self.mobile_content_tr = opencc.convert(self.mobile_content or "",
                                                config='s2t.json')
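When many fields share the same conversion profile, as in the snippet above, it may be cheaper to build a single converter and loop over the field names. The sketch below uses the class-based API of newer OpenCC bindings; the _TR_FIELDS tuple is hypothetical and introduced only for illustration:

    _TR_FIELDS = ("title", "title_suffix", "foreword", "content",
                  "intro", "mobile_title", "mobile_content")

    def populate_tr_fields(self):
        from opencc import OpenCC

        cc = OpenCC('s2t')  # one Simplified-to-Traditional converter
        for field in self._TR_FIELDS:
            setattr(self, field + "_tr", cc.convert(getattr(self, field) or ""))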
Example #38
    def polling(self, loop):
        '''
        :param loop: the default event loop
        :type loop: asyncio.BaseEventLoop
        '''
        offset = 0
        while True:
            try:
                req = yield from loop.run_in_executor(
                    None, lambda: requests.post(self.prefix + '/getUpdates', timeout=None,
                                                data=dict(offset=offset, timeout=self.timeout)))
                j = req.json()
            except ValueError:
                self.debug("ERROR", req.text)
                continue
            except ConnectionError as e:
                self.debug("ERROR", str(e))
                continue
            if not j['ok'] or not j['result']:
                continue
            self.debug("receive", json.dumps(j))
            # update offset for next `/getUpdates`
            offset = max([r['update_id'] + 1 for r in j['result']])
            for r in j['result']:
                # process inline queries
                if 'inline_query' in r:
                    q = r['inline_query']
                    cid = q['from']['id']
                    if cid in self.allowed:
                        resp = self.q.inline(q['id'], q['offset'] or 0, q['query'])
                        self.sender.send_resp(cid, resp)

                elif 'chosen_inline_result' in r:
                    continue  # TODO: ???

                else:  # normal messages
                    m = r['message']
                    self.debug("message", json.dumps(m))
                    mid = m['message_id']
                    cid = m['chat']['id']
                    if cid in self.allowed and 'text' in m and m['text'][0] == '/':
                        m['text'] = opencc.convert(m['text'])
                        resp = self.q.query(cid, m['text'][1:])
                        self.sender.send_resp(cid, resp, mid)
                    else:
                        self.sender.send_resp(cid, Resp(message='mew?'))
Example #39
    def load_text(self, text_file):
        """Load and process source text file whose 
        type can be '.quote' or '.verse'"""
        self._clear_obj()
        assert os.path.exists(text_file), text_file
        assert text_file.endswith('quote') or text_file.endswith('verse')
        self.src_path = text_file
        with open(text_file, 'r') as f:
            text = f.read()

        if self.jianti:
            import opencc
            text = opencc.convert(text, config='t2s.json')

        if '--------' in text:
            self._parse_cooked(text)
        else:
            self._parse(text)
Example #40
def pos_extract():
    path = '/home/haoming/iPIN/haoming_position_all_14/raw/'
    file_list = os.listdir(path)
    output = open('src/Prep_pos.out', 'w')
    # outfile = open('File_name.out','w')
    count = 1

    for cur_file in file_list:
        if "crc" in cur_file:
            continue
        else:
            file_name = os.path.join(path, cur_file)
            #         outfile.write(file_name+'\n')
            f = open(file_name, 'r')
            end = 0
            while not end:
                line = f.readline()
                if line != '':
                    job_item, position, description = line.split('\x01', 2)
                    '''Old version: for every position title, split at stop words or punctuation and store each part as a new title'''
                    #                     position = clean(position)                #
                    #                     if not position.replace(' ', '').isalpha():
                    #                         position.strip(' ')
                    #                     for item in position.split('\n'):
                    #                         if item not in illegal and item.isdigit() == False:
                    #                             output.write(item.encode('utf8') + '\n')
                    #                             count =  count + 1
                    #                             print count,  item
                    '''Revised 2015-08-03: discard the position title if it contains stop words or punctuation (spaces inside English titles are allowed)'''
                    position = position.upper()  # uppercase all English letters
                    position = opencc.convert(position,
                                              config='zht2zhs.ini')  # convert Traditional to Simplified
                    if (position not in illegal) and (
                            not hasSymbol(position.encode('utf-8'))
                    ):  # and (position.isdigit() == False)
                        output.write(position.encode('utf8') + '\n')
                        count = count + 1
                        print count, position
                else:
                    end = 1
        line = f.readline()
    f.close()
    print "*******Complete!*******"
Example #41
def process_TW(map_CN, locale_TW):
    # Process the Taiwan (zh-TW) translations
    map_TW = get_kvmap_in_locale_file(locale_TW)
    error_keys = []
    result_TW = ''
    for k, v in map_TW.items():
        if not v and map_CN[k]:
            try:
                v_CN = codecs.decode(map_CN[k], 'unicode_escape')
                v_TW = opencc.convert(v_CN, config='s2twp.json')
                v_TW = codecs.encode(v_TW, 'unicode_escape').decode()
                v = v_TW.upper().replace('\\U', '\\u')
            except Exception as e:
                print(str(e))
                error_keys.append(k)
        result_TW += k + '=' + v + '\n'
    with open(locale_TW, 'w') as f:
        f.write(result_TW)
    print('请手动处理以下key: ')
    for k in error_keys:
        print(k)
Example #42
def to_record(tweet):
    """Convert a tweet of type dict to Tweet database instance

    :param dict tweet: a tweet
    :return: Tweet database instance
    :rtype: Tweet
    """
    if 'retweeted_status' in tweet:
        typ = 'rt'
    elif tweet['in_reply_to_status_id']:
        typ = 'reply'
    elif tweet['is_quote_status']:
        typ = 'quote'
    else:
        typ = 'tweet'
    timestamp = int(parse_time(tweet['created_at']).timestamp())
    text = opencc.convert(tweet['text'])
    t = Tweet(id=int(tweet['id']), user_id=tweet['user']['id'],
              type=typ, timestamp=int(timestamp), tweet=json.dumps(tweet),
              text=text)
    return t
Example #43
def pos_extract():
    path = '/home/haoming/iPIN/haoming_position_all_14/raw/'
    file_list = os.listdir(path)
    output = open('src/Prep_pos.out','w')
    # outfile = open('File_name.out','w')
    count = 1
    
    for cur_file in file_list:
        if "crc" in cur_file:
            continue
        else:
            file_name = os.path.join(path, cur_file)
    #         outfile.write(file_name+'\n')
            f = open(file_name, 'r')
            end = 0
            while not end:
                line = f.readline()
                if line != '':
                    job_item, position, description = line.split('\x01', 2)
                    '''Old version: for every position title, split at stop words or punctuation and store each part as a new title'''
#                     position = clean(position)                #
#                     if not position.replace(' ', '').isalpha():
#                         position.strip(' ')
#                     for item in position.split('\n'):
#                         if item not in illegal and item.isdigit() == False:
#                             output.write(item.encode('utf8') + '\n')
#                             count =  count + 1  
#                             print count,  item
                    '''Revised 2015-08-03: discard the position title if it contains stop words or punctuation (spaces inside English titles are allowed)'''
                    position = position.upper()                                    # uppercase all English letters
                    position = opencc.convert(position, config = 'zht2zhs.ini')    # convert Traditional to Simplified
                    if (position not in illegal) and (not hasSymbol(position.encode('utf-8'))): # and (position.isdigit() == False) 
                        output.write(position.encode('utf8') + '\n')
                        count =  count + 1  
                        print count,  position
                else:
                    end = 1
        line = f.readline()
    f.close()
    print "*******Complete!*******"
Example #44
def clean(text):
    #     delset = string.punctuation + string.digits + ' '
    #     text = text.translate(None, delset)
    text = text.upper()
    text = opencc.convert(text, config='zht2zhs.ini')  # convert Traditional to Simplified
    for i in symbol:
        text = text.replace(i, '\n')
    if '兼' in text and '兼职' not in text:
        text = text.replace('兼', '\n')
    if '+' in text and 'C++' not in text:
        text = text.replace('+', '\n')
    if '+' in text and 'C++' not in text:
        text = text.replace('+', '\n')
    if '#' in text and 'C#' not in text:
        text = text.replace('#', '\n')
    if '#' in text and 'C#' not in text:
        text = text.replace('#', '\n')
    if '.' in text and '.NET' not in text:
        text = text.replace('.', '\n')
    if ' ' in text and not text.replace(' ', '').isalnum():
        text = text.replace(' ', '\n')
    return text
Example #45
def clean(text):
#     delset = string.punctuation + string.digits + ' '
#     text = text.translate(None, delset)
    text = text.upper()
    text = opencc.convert(text, config='zht2zhs.ini')   # convert Traditional to Simplified
    for i in symbol :
        text = text.replace(i, '\n')
    if '兼' in text and '兼职' not in text:
        text = text.replace('兼', '\n')
    if '+' in text and 'C++' not in text:
        text = text.replace('+', '\n')
    if '+' in text and 'C++' not in text:
        text = text.replace('+', '\n')
    if '#' in text and 'C#' not in text:
        text = text.replace('#', '\n')
    if '#' in text and 'C#' not in text:
        text = text.replace('#', '\n')
    if '.' in text and '.NET' not in text:
        text = text.replace('.', '\n')
    if ' ' in text and not text.replace(' ', '').isalnum():
        text = text.replace(' ', '\n')
    return text
Example #46
def wiki_to_term():
    mariadb_info = json.load(open('../etc/mariadb_settings.json'))
    engine = sqlalchemy.create_engine('mysql+mysqldb://%s:%s@%s/%s' % (
            mariadb_info['user'], mariadb_info['pwd'], mariadb_info['host'], mariadb_info['db']))
    connection = engine.connect()

    print 'Processing wikipedia titles'
    # details of namespace: https://en.wikipedia.org/wiki/Wikipedia:Namespace
    valid_wiki_namespace = [0, 118]
    result = connection.execute("""SELECT page_title FROM page WHERE page_namespace in (%s)""" % (','.join([str(s) for s in valid_wiki_namespace])))
    terms = set()
    for i, row in enumerate(result, 1):
        sys.stdout.write('\r%i / %i' % (i, result.rowcount))
        title_unicode = row['page_title'].decode('utf-8')
        if is_all_chinese_chars(title_unicode) and len(title_unicode) > 1:
            terms.add(opencc.convert(title_unicode.encode('utf-8'), 'zhs2zhtw_p.ini'))
    connection.close()

    sys.stdout.write('\nSorting and saving results...')
    with open('../var/tw-wiki-dict.dict', 'w') as f_out:
        for t in sorted(terms):
            f_out.write('%s\n' % (t.encode('utf-8')))
    print 'Done'
Example #47
    def format_tweet(self, tweet):
        """Format a single tweet.

        :param dict tweet: a tweet object
        :return: formatted string of a tweet
        :rtype: string
        """
        rep = {}
        entities = tweet.get('entities', {})
        for u in entities.get('urls', []) + entities.get('media', []):
            idx = tuple(u['indices'])
            rep.setdefault(idx, []).append('[{}]({})'.format(
                u['display_url'], u.get('media_url', u['expanded_url'])))
        for u in entities.get('user_mentions', []):
            idx = tuple(u['indices'])
            rep.setdefault(idx, []).append('[{}]({})'.format(
                '@'+u['screen_name'], 'twitter.com/' + u['screen_name']))
        for u in entities.get('hashtags', []):
            idx = tuple(u['indices'])
            rep.setdefault(idx, []).append('[{}]({})'.format('#'+u['text'],
                'twitter.com/hashtag/{}?src=hash'.format(u['text'])))
        for u in entities.get('symbols', []):
            idx = tuple(u['indices'])
            rep.setdefault(idx, []).append('[{}]({})'.format('$'+u['text'],
                'twitter.com/search?q=${}&src=ctag'.format(u['text'])))

        text = list(opencc.convert(tweet['text']))
        last = len(text)
        for idx in sorted(rep.keys(), reverse=True):
            st, ed = idx
            if ed < last:  # escape other parts
                text[ed:last] = self.E('_*[`')(unescape(''.join(text[ed:last])))
            text[st:ed] = ' '.join(rep[idx])
            last = st
        text[0:last] = self.E('_*[`')(unescape(''.join(text[0:last])))
        return ''.join(text)
Example #48
def index(request):
    results = None
    form = None
    msg = None
    form = PhraseRelevanceQueryForm(request.GET)
    selected_model = form.available_models[0][0]
    sample_phrases = [u'美國', u'魯夫', u'二二八事件', u'宋朝', u'八八風災', u'電話']
    if request.method == 'GET':
        if request.GET:
            selected_model = request.GET['selected_model']
            model = gensim.models.Word2Vec.load(selected_model)
            phrase1 = opencc.convert(request.GET['phrase1'], 'zhs2zhtw_p.ini').strip()
            try:
                results = model.most_similar(phrase1)
            except KeyError:
                msg = u'Word "%s" is not in vocabulary' % (phrase1)
    vars = {
        'form': form,
        'results': results,
        'msg': msg,
        'sample_phrases': sample_phrases,
        'selected_model': selected_model,
    }
    return render_to_response('relevance/index.djhtml', vars)
Example #49
 def before_index(self, pkg_dict):
     title = pkg_dict['title']
     title = opencc.convert(title, config='zhtw2zhcn_s.ini')
     seg_list = jieba.cut_for_search(title)
     pkg_dict['title'] = " ".join(seg_list)
     return pkg_dict
Example #50
def traditionalize(text):
    return opencc.convert(text, config="zhs2zht.ini").encode("utf-8")
Example #51
    def populate_tr_fields(self):
        import opencc

        self.quote_tr = opencc.convert(self.quote or "", config="s2t.json")
Example #52
    def populate_tr_fields(self):
        import opencc

        self.name_tr = opencc.convert(self.name or "", config="s2t.json")
        self.intro_tr = opencc.convert(self.intro or "", config="s2t.json")
Example #53
def simplify(text):
 return opencc.convert(text, config='t2s.json')
Example #54
    def populate_tr_fields(self):
        import opencc

        self.name_tr = opencc.convert(self.name or "", config='s2t.json')
Example #55
def traditionalize(text):
 return opencc.convert(text, config='zhs2zht.ini').encode('utf-8')
Example #56
import pickle
import opencc

data = []
with open("./newsData.pkl", 'rb') as fr:
    data = pickle.load(fr)
dataCn = []
for post in data:
    postCn = []
    for ele in post:
        postCn.append(opencc.convert(ele))
    dataCn.append(postCn)

with open("./newsDataCn.pkl", 'wb') as fw:
    pickle.dump(dataCn, fw)
Example #57
#  gathering snmp data
from __future__ import division
import re
import opencc
import os
from gensim import corpora
from scwsseg.utils import load_scws, cut, load_emotion_words

AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

cut_str = load_scws()

emotions_words = load_emotion_words()
emotions_words = [unicode(e, 'utf-8') for e in emotions_words]
t_emotions_words = [opencc.convert(e, config='zhs2zht.ini') for e in emotions_words]
emotions_words.extend(t_emotions_words)
emotions_words = [w.encode('utf-8') for w in emotions_words]
emotions_words_set = set(emotions_words)
emotion_pattern = re.compile(r'\[(\S+?)\]')


def if_emoticoned_weibo(r):
    # whether the weibo post contains any of the specified emoticons
    emotions = re.findall(emotion_pattern, r['text'])
    is_emoticoned = 1 if set(emotions) & emotions_words_set else 0
    return is_emoticoned


def if_empty_retweet_weibo(r):
    # this function is not used yet, so it is not handled in detail
Example #58
def test_convert():
    text = '乾坤一擲'
    expect = '乾坤一掷'
    assert convert(text) == expect
Example #59
 def test_parse_content(self):
     document = self.item.parse_content()
     converted_keyword = opencc.convert(self.KEYWORD, config="t2s.json")
     self.assertIn(converted_keyword, document.title.text)