def populate_tr_fields(self):
    import opencc
    self.name_tr = opencc.convert(self.name or "", config='s2t.json')
    self.full_name_tr = opencc.convert(self.full_name or "", config='s2t.json')
    self.desc_tr = opencc.convert(self.desc or "", config='s2t.json')

def adapt(cls, site, channel, doc, mapping, region):
    """Adapt the data format stored in MongoDB to the format supported by
    the online storage API."""
    form = channel["form"]
    caller = cls.__dict__[cls._map[form]].__func__
    data = caller(cls, site, channel, doc, mapping)
    # Foreign-media (29) and odd-news (31) channels: convert title and
    # content from Traditional to Simplified Chinese
    if mapping["first_cid"] in ["29", 29, "31", 31]:
        import opencc
        data["title"] = opencc.convert(data["title"])
        for i, item in enumerate(data["content"]):
            if "txt" in item:
                data["content"][i]["txt"] = opencc.convert(
                    data["content"][i]["txt"])
    data["title"] = cls.normalize_unicode(data["title"])  # normalize some characters
    # Unify some common fields
    data["unique_id"] = "%s_%s" % (form, doc["request"])  # docid
    data["publish_site"] = doc["publish_ori_name"] or site["name"]  # pname
    pt = format_datetime_string(doc["publish_time"], g=True)
    data["publish_time"] = pt[:10] + "T" + pt[11:] + "Z"  # ptime
    data["insert_time"] = datetime.now().isoformat()[:-7] + "Z"
    data["site_icon"] = doc["publish_ori_icon"]
    data["channel_id"] = mapping["first_cid"]
    if mapping["second_cid"]:  # upload the online second-level channel id if present
        data["second_channel_id"] = mapping["second_cid"]
    if doc.get("tags"):
        data["tags"] = cls.split_tag_words(doc["tags"])
    elif form == "news":  # Fixme: hard-coded type check
        data["tags"] = cls.generate_tags_for_game(
            doc["title"], mapping["first_cid"],
            channel["name"])  # Fixme: game channels need tags generated from the title
    else:
        data["tags"] = list()
    if form != "video":  # compute image_number uniformly
        data["image_number"] = sum(
            [1 for item in data["content"] if "img" in item])
    data["online"] = True
    if region:  # geographic information
        if region["province"]:
            data["province"] = region["province"]
        if region["city"]:
            data["city"] = region["city"]
        if region["county"]:
            data["district"] = region["county"]
    # Fixme: kept so the legacy API can fetch data by online_source_id;
    # the new API does not need this field
    if mapping.get("online_source_sid"):
        data["source_id"] = mapping["online_source_sid"]
    # Add: new field so the online side can look up the crawl source
    data["spider_source_id"] = str(channel["_id"])
    return data

def parseGos(link, g_id):
    resp = requests.get(url=str(link), cookies={"over18": "1"})
    soup = BeautifulSoup(resp.text)
    print(resp)
    # author
    author = soup.find(id="main-container").contents[1].contents[0].contents[1].string.replace(' ', '')
    author = opencc.convert(author)
    # title
    title = soup.find(id="main-container").contents[1].contents[2].contents[1].string.replace(' ', '')
    title = opencc.convert(title)
    # date
    date = soup.find(id="main-container").contents[1].contents[3].contents[1].string
    # ip
    try:
        ip = soup.find(text=re.compile("※ 發信站:"))
        ip = re.search(r"[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*", str(ip)).group()
    except:
        ip = "ip not found"
    # content
    a = str(soup.find(id="main-container").contents[1])
    a = a.split("</div>")
    a = a[4].split("<span class=\"f2\">※ 發信站: 批踢踢實業坊(ptt.cc),")
    content = a[0].replace(' ', '').replace('\n', '').replace('\t', '')
    content = re.sub('<([^>]*)>[^<]*<[^>]*>', '', content)
    content = re.sub('<([^>]*)>', '', content)
    content = re.sub('[A-Za-z0-9]*', '', content)
    content = re.sub(r'\.', '', content)
    content = opencc.convert(content)
    # push messages
    num, all, g, b, n, message = 0, 0, 0, 0, 0, {}
    for tag in soup.find_all("div", "push"):
        num += 1
        push_tag = tag.find("span", "push-tag").string.replace(' ', '')
        push_userid = tag.find("span", "push-userid").string.replace(' ', '')
        push_content = tag.find("span", "push-content").string.replace(' ', '').replace('\n', '').replace('\t', '').replace(':', '')
        push_content = opencc.convert(push_content)
        push_ipdatetime = tag.find("span", "push-ipdatetime").string.replace('\n', '')
        message[num] = {"狀態": push_tag, "留言者": push_userid,
                        "留言內容": push_content, "留言時間": push_ipdatetime}
        if push_tag == '推':
            g += 1
        elif push_tag == '噓':
            b += 1
        else:
            n += 1
    messageNum = {"g": g, "b": b, "n": n, "all": num}
    # json data
    d = {"a_ID": g_id, "b_作者": author, "c_標題": title, "d_日期": date,
         "e_ip": ip, "f_內文": content, "g_推文": message, "h_推文總數": messageNum}
    json_data = json.dumps(d, ensure_ascii=False, indent=4, sort_keys=True) + ','
    store(json_data)

def convert_tree(subs):
    for sub in subs:
        if sub.title is not None:
            sub.title = convert(sub.title)
        if sub.sec_title is not None:
            sub.sec_title = convert(sub.sec_title)
        if isinstance(sub, Sutra):
            sub.main_lines = [convert(line) for line in sub.main_lines]
        else:
            convert_tree(sub.subs)

def getMetadata(cls, audioId):
    opencc = cls._opencc()
    youtube = APIService._youtube(authenticate=False)
    youtubeData = youtube.videos().list(id=audioId, part="snippet").execute()["items"][0]["snippet"]
    shikData = requests.get(cls.SHIK_API_URL, params={'youtube_id': audioId}).json()
    artists = []
    if shikData["artist"] is not None:
        artists.append(opencc.convert(shikData["artist"].encode('utf-8')))
    return {
        "artist": artists,
        "thumbnail": youtubeData["thumbnails"]["default"]["url"].encode('utf-8'),
        "title": opencc.convert(youtubeData["title"].encode('utf-8')),
    }

def proprocess_LDC2005T10(data_path, outpath):
    from bs4 import BeautifulSoup
    import opencc
    chinese_path = os.path.join(data_path, "data/Chinese")
    english_path = os.path.join(data_path, "data/English")
    alignment_path = os.path.join(data_path, "data/alignment")
    alignment_files = os.listdir(alignment_path)
    en_outfile = open(os.path.join(outpath, "nmpt.en"), 'w')
    ch_outfile = open(os.path.join(outpath, "nmpt.zh"), 'w')
    c_count_line = 0
    e_count_line = 0
    for afile in alignment_files:
        print os.path.join(alignment_path, afile)
        alignment_f = open(os.path.join(alignment_path, afile))
        a_soup = BeautifulSoup(alignment_f.read())
        chinese_f = open(os.path.join(chinese_path, afile), 'r')
        c_soup = BeautifulSoup(chinese_f.read(), fromEncoding="CP950")
        english_f = open(os.path.join(english_path, afile), 'r')
        e_soup = BeautifulSoup(english_f.read())
        for alig in a_soup.find_all('alignment'):
            for sentpair in alig.find_all('sentpair'):
                if sentpair['chinesesegid'] == "" or sentpair["englishsegid"] == "":
                    continue
                c1 = c_soup.find_all('doc', attrs={"docid": alig['docid']})
                e1 = e_soup.find_all('doc', attrs={"docid": alig['docid']})
                if len(c1) < 1 or len(e1) < 1:
                    continue
                state = 0
                for cid in sentpair['chinesesegid'].split(','):
                    c2 = c1[0].find_all('seg', attrs={'id': cid})
                    if len(c2) < 1:
                        state = 1
                        break
                    line = unicode(c2[0].string).encode('utf-8').strip()
                    line = opencc.convert(line).encode('utf8')
                    ch_outfile.write(line + " ")
                ch_outfile.write('\n')
                c_count_line += 1
                for eid in sentpair['englishsegid'].split(','):
                    if state == 1:
                        state = 0
                        break
                    e2 = e1[0].find_all('seg', attrs={'id': eid})
                    if len(e2) < 1:
                        break
                    line = unicode(e2[0].string).encode('utf-8').strip()
                    en_outfile.write(line + " ")
                en_outfile.write('\n')
                e_count_line += 1
        alignment_f.close()
        chinese_f.close()
        english_f.close()
    en_outfile.close()
    ch_outfile.close()
    print c_count_line, e_count_line

def process(corpus_path=CORPUS_ROOT, out_dict_path=OUTPUT_DICT):
    full_dict = []
    for root, subFolders, files in os.walk(corpus_path):
        for name in files:
            if file_is_valid(root, name):
                with open(os.path.join(root, name), mode='r', encoding='utf-8') as corpus_file:
                    try:
                        content = corpus_file.read()
                        content_cn = opencc.convert(content, config='t2s.json')
                        lines = content.split()
                        lines_cn = content_cn.split()
                        for line in lines:
                            for char in line:
                                if char not in full_dict:
                                    full_dict.append(char)
                        for line in lines_cn:
                            for char in line:
                                if char not in full_dict:
                                    full_dict.append(char)
                    except Exception:
                        traceback.print_exc()
                        print('Something not very nice happened with {}; skipping file.'
                              .format(os.path.join(root, name)))
    if '\n' in full_dict:
        full_dict.remove('\n')
    full_dict.sort()
    with open(out_dict_path, mode='w', encoding='utf-8') as cn_cdict_file:
        cn_cdict_file.writelines('\n'.join(full_dict))

def translate_to_zh_cn(nikaya_book):
    """
    :param nikaya_book:
    :type nikaya_book: Nikaya
    :return:
    """
    nikaya = copy.deepcopy(nikaya_book)
    nikaya.title_chinese = convert(nikaya.title_chinese)
    nikaya.languages.append('zh-cn')

    def convert_tree(subs):
        for sub in subs:
            if sub.title is not None:
                sub.title = convert(sub.title)
            if sub.sec_title is not None:
                sub.sec_title = convert(sub.sec_title)
            if isinstance(sub, Sutra):
                sub.main_lines = [convert(line) for line in sub.main_lines]
            else:
                convert_tree(sub.subs)

    convert_tree(nikaya.subs)
    return nikaya

def t2t(text: str, normal: bool, printable: bool, pure: bool = False) -> str:
    # Convert the string, text to text
    try:
        if not text:
            return ""
        if normal:
            for special in ["spc", "spe"]:
                text = "".join(eval(f"glovar.{special}_dict").get(t, t) for t in text)
            text = normalize("NFKC", text)
        if printable:
            text = "".join(t for t in text if t.isprintable() or t in {"\n", "\r", "\t"})
        if normal and glovar.zh_cn:
            text = convert(text, config="t2s.json")
        if pure:
            text = sub(r"""[^\da-zA-Z一-龥.,:'"?!~;()。,?!~@“”]""", "", text)
    except Exception as e:
        logger.warning(f"T2T error: {e}", exc_info=True)
    return text

def predict(content):
    instance = {}
    featureList = []
    with open('feature.pkl', 'rb') as f0:
        featureList = pickle.load(f0)
    for ele in featureList:
        instance[ele] = 0
    content = opencc.convert(content)
    for ele in featureList:
        if ele in content:
            instance[ele] += 1
    term = []
    x = []
    for ele in featureList:
        term.append(instance[ele])
    x.append(term)
    X = np.array(x)
    with open('lsa.pkl', 'rb') as f1:
        lsa = pickle.load(f1)
    X = lsa.transform(X)
    with open('model.pkl', 'rb') as f2:
        clf = pickle.load(f2)
    yPred = clf.predict(X)
    return yPred[0]

def cloneWithSimplified(cls, **kwargs):
    """
    XXX(Yorkie): invalid fields cannot be filtered out yet; support will be added later.
    Description:
        XX.cloneWithSimplified(title=1, name=2)
    If XX doesn't define a `title` field, the program currently breaks;
    once filtering is supported, this error will be ignored.
    """
    fields = {'defaults': {}}
    for name, value in kwargs.items():
        val = value
        if type(value) == str or type(value) == unicode:
            val = convert(value, config='t2s')
        try:
            cls.update_keys.index(name)
            fields[name] = val
        except ValueError:
            fields['defaults'][name] = val
        except AttributeError:
            fields[name] = val
    fields['lang'] = 2
    del fields['defaults']['id']
    obj, isNew = cls.objects.get_or_create(**fields)
    for name, value in kwargs.items():
        item = getattr(obj, name)
        if isinstance(item, Model):
            setattr(obj, name, item.getSimplifiedObject())
    return (obj, isNew)

def simple_preprocess(text, *maps, **ops):
    '''
    Simple preprocess.

    Args:
        text: unicode string to process
        maps: conversion maps
        ops: operations to perform. Supported: trim_space, t2s, full2half, lower.

    Returns:
        processed string.
    '''
    for m in maps:
        for fr, to in iteritems(m):
            text = re.sub(fr, to, text)
    if not text:
        return text
    if ops.get('trim_space', False):
        text = re.sub(u'\s{2,}', ' ', text)
    if ops.get('t2s', False):
        import opencc
        text = opencc.convert(text)
    if ops.get('full2half', False):
        text = str_full2half(text)
    if ops.get('lower', False):
        text = text.lower()
    return text

def before_search(self, search_params):
    if search_params.has_key('q'):
        if 'owner_org:' not in search_params['q']:
            q = search_params['q']
            q = opencc.convert(q, config='zhtw2zhcn_s.ini')
            search_params['q'] = u" ".join(jieba.cut(q))
            # print search_params['q']
    return search_params

def getSimplifiedObject(self):
    """
    If the model has no `title` field, or you don't want `title` to be the
    key used for language exchange, override this method.
    """
    simplifiedTitle = convert(self.title, config='t2s')
    return self.__class__.objects.filter(lang=2, title=simplifiedTitle)[0]

def convert(self, text):
    """
    Leave the text unconverted, convert to Simplified, or convert to
    Traditional, depending on the configured conversion.
    Very slow; not recommended.
    """
    if self.convert_config is None:
        return text
    return opencc.convert(text, config=self.convert_config)

def simplify(trad_string):
    """Convert Taiwanese Traditional Chinese to Simplified Chinese.

    Converts a string in (Taiwan) Traditional Chinese into Simplified
    Chinese using OpenCC.
    """
    simplified = opencc.convert(trad_string, config="tw2s.json")
    return simplified

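# A possible usage sketch for simplify() above, assuming the legacy
# module-level opencc.convert API used throughout these snippets; the sample
# string and expected output are illustrative, not authoritative.
print(simplify('臺灣漢字轉換'))  # expected roughly: '台湾汉字转换'
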
def save_model(self, request, obj, form, change):
    cur_time = gmtime()
    model = self.model
    lang = form.cleaned_data.get('lang')
    fields = {}
    for field in self._baseFields:
        val = form.cleaned_data.get(field.name)
        fields[field.name] = val
    if not lang:
        lang = self.__lang__(form)
    obj.lang = fields['lang'] = lang
    if self._hasModifyDate:
        fields['modify_date'] = strftime("%Y-%m-%d %H:%M:%S", cur_time)
    if self._hasCreateDate:
        fields['create_date'] = strftime("%Y-%m-%d %H:%M:%S", cur_time)
    if is_traditional(lang):
        # convert the values to the simplified object first
        simplifiedObj, isNew = model.cloneWithSimplified(**fields)
        if (not (self._hasModifyDate and not isNew and
                 simplifiedObj.modify_date > obj.modify_date)):
            for field in self._richTextFields:
                text = form.cleaned_data.get(field.name)
                setattr(simplifiedObj, field.name, convert(text))
                fields[field.name] = text
            simplifiedObj.save()
        # convert the values to Traditional
        for name, value in fields.items():
            if type(value) == str or type(value) == unicode:
                setattr(obj, name, convert(value, config='s2t'))
    if self._hasModifyDate:
        obj.modify_date = strftime("%Y-%m-%d %H:%M:%S", cur_time)
    if self._hasCreateDate and not obj.create_date:
        obj.create_date = strftime("%Y-%m-%d %H:%M:%S", cur_time)
    return super(ModelAdmin, self).save_model(request, obj, form, change)

def chatbot_tallk(message_text):
    # message_text = 'hi'
    chatbot = ChatBot('Ron Obvious',
                      trainer='chatterbot.trainers.ChatterBotCorpusTrainer')
    return_text = chatbot.get_response(message_text)
    return_text = str(return_text)
    return_text2 = opencc.convert(return_text, config='mix2zht.ini')
    return return_text2

def t2s(input_path, output_path):
    s_lines = ''
    with open(input_path, 'r') as f:
        lines = f.readlines()
    for line in lines:
        s_line = opencc.convert(line)
        s_lines += s_line.encode('utf-8')
    with open(output_path, 'w') as f:
        f.write(s_lines[:-1])

def opencc_view(request, id):
    url = 'http://scp-wiki-cn.wikidot.com/scp-{}'.format(id)
    res = requests.get(url)
    if res.status_code != 200:
        return
    html = opencc.convert(res.text, config='s2tw.json')
    return HttpResponse(html)

def cut(self, sentence, pos=True):
    # Tokenize the simplified form, then map each token back onto the
    # original (Traditional) sentence by character offsets.
    simplified = opencc.convert(sentence, config='tw2s.json')
    tokenized = self.tokenizer(simplified, pos=pos)
    recovered = []
    head = 0
    for tok in tokenized:
        l = len(tok.word)
        recovered.append(Word(sentence[head:head + l], pos=tok.pos))
        head += l
    return recovered

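# The offset-recovery trick in cut() above assumes the tw2s conversion is
# length-preserving character for character, which holds for one-to-one
# variant mappings but not necessarily for phrase-level configs such as
# tw2sp. A minimal sanity check (the sample sentence is hypothetical):
sentence = '臺灣的程式設計'
simplified = opencc.convert(sentence, config='tw2s.json')
assert len(simplified) == len(sentence), "offset mapping would mis-align tokens"
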
def rank(self, word):
    # returns None if the word rank is unknown
    if word in self.rank_map:
        return self.rank_map[word]
    simp_word = opencc.convert(word)
    if simp_word in self.rank_map:
        return self.rank_map[simp_word]
    return None

def convert_to_simple_chinese(self, sentence):
    '''
    Convert a sentence from Traditional to Simplified Chinese.

    :param sentence: original sentence
    :type sentence: str
    :return: Simplified Chinese sentence
    :rtype: str
    '''
    simple_chinese = opencc.convert(sentence, config='zht2zhs.ini')
    return simple_chinese

def process_chinese_transformation(file_input, file_output, mode='t2s'):
    with open(file_input, 'r') as f_in, open(file_output, 'w') as f_out:
        config_mode = mode + '.json'
        num_total = 0
        for num, line in enumerate(f_in):
            f_out.writelines([opencc.convert(line, config=config_mode)])
            num_total = num + 1
            if num_total % 10000 == 0:
                logger.info('Converted %s lines' % num_total)
        logger.info('Finished, converted %s lines' % num_total)

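# Possible invocations of process_chinese_transformation(); the file names
# here are hypothetical.
process_chinese_transformation('corpus_trad.txt', 'corpus_simp.txt', mode='t2s')
process_chinese_transformation('corpus_simp.txt', 'corpus_trad.txt', mode='s2t')  # reverse direction, via s2t.json
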
def wiki_replace(d):
    s = d[1]
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', r'\1[[\2]]', s)
    s = filter_wiki(s)
    s = re.sub(r'\* *\n|\'{2,}', '', s)
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\n[:;]|\n +', '\n', s)
    s = re.sub(r'\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    return opencc.convert(s).strip()

def populate_tr_fields(self):
    import opencc
    self.title_tr = opencc.convert(self.title or "", config='s2t.json')
    self.title_suffix_tr = opencc.convert(self.title_suffix or "", config='s2t.json')
    self.foreword_tr = opencc.convert(self.foreword or "", config='s2t.json')
    self.content_tr = opencc.convert(self.content or "", config='s2t.json')
    self.intro_tr = opencc.convert(self.intro or "", config='s2t.json')
    self.mobile_title_tr = opencc.convert(self.mobile_title or "", config='s2t.json')
    self.mobile_content_tr = opencc.convert(self.mobile_content or "", config='s2t.json')

def main():
    ss = Search()
    title = '成唯识论'
    import opencc
    title = opencc.convert(title, config='s2t.json')
    s = time.time()
    ss.search(title)
    e = time.time()
    print(e - s)
    for idx in ss.search(title):
        print(idx, ss.titles[idx])

def convert_tran_sim_save(path, deputy):
    with open(path + deputy, "r") as f:
        lines = f.readlines()
    lines_s = []
    lines_t = []
    for i, line in enumerate(lines):
        line = convert_to_unicode(line)
        lines_s.append(opencc.convert(line, config="t2s.json"))
        lines_t.append(opencc.convert(line, config="s2t.json"))
        print(i)
        print(line)
        print(lines_s[i])
        print(lines_t[i])
    with open(path + "_s" + deputy, "w") as f:
        f.writelines(lines_s)
    with open(path + "_t" + deputy, "w") as f:
        f.writelines(lines_t)
    print(path + deputy, "ok")

def wiki_replace(d):
    # WikiExtractor drops text wrapped in markup such as {{...}}, while
    # wikicorpus strips all punctuation, so this function preserves both
    # the punctuation and the text inside {{...}}.
    s = d[1]
    s = re.sub(r':*{\|[\s\S]*?\|}', '', s)
    s = re.sub(r'<gallery>[\s\S]*?</gallery>', '', s)
    s = re.sub(r'(.){{([^{}\n]*?\|[^{}\n]*?)}}', r'\1[[\2]]', s)
    s = filter_wiki(s)
    s = re.sub(r'\* *\n|\'{2,}', '', s)
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\n[:;]|\n +', '\n', s)
    s = re.sub(r'\n==', '\n\n==', s)
    s = u'【' + d[0] + u'】\n' + s
    return opencc.convert(s).strip()

def process_zhwiki(min_content_length=10):
    jieba.load_userdict('../var/tw-dict.dict')
    i = 0
    with open('../var/zhwiki_text.txt', 'w') as f_out:
        for title, content, pageid in gensim.corpora.wikicorpus.extract_pages(
                bz2.BZ2File('../var/zhwiki-20150901-pages-articles.xml.bz2'),
                filter_namespaces=('0',)):
            content = gensim.corpora.wikicorpus.filter_wiki(content)
            content = opencc.convert(content, 'zhs2zhtw_p.ini')
            content = keep_acceptable_chars(content)
            content = regularize_content(content)
            if len(content) >= min_content_length:
                try:
                    tag = opencc.convert(title, 'zhs2zhtw_p.ini')
                    content = ' '.join([t for t in jieba.cut(content, cut_all=False) if t != ' '])
                    f_out.write('%s ::: %s\n' % (tag.encode('utf-8'), content.encode('utf-8')))
                    i += 1
                except:
                    pass
                if i % 1000 == 0:
                    logger.info("Saved %i articles" % (i))
    logger.info("Totally saved %i articles" % (i))

def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))
    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)
    lines = []    # unconverted
    lines_s = []  # Simplified
    lines_t = []  # Traditional
    for input_file in input_files:
        with tf.gfile.GFile(input_file, "r") as reader:
            while True:
                line = convert_to_unicode(reader.readline())
                if not line:
                    break
                if is_non_content(line):
                    continue
                # line = extract_chinese(line)
                lines.append(line)
                lines_s.append(opencc.convert(line, config="t2s.json"))
                lines_t.append(opencc.convert(line, config="s2t.json"))
    output_files = FLAGS.output_file.split(",")
    tf.logging.info("*** Writing to output files ***")
    for output_file in output_files:
        tf.logging.info("  %s", output_file)
        with tf.gfile.GFile(output_file, "w") as f:
            f.write(''.join(lines))
        with tf.gfile.GFile(output_file + "_s", "w") as f:
            f.write(''.join(lines_s))
        with tf.gfile.GFile(output_file + "_t", "w") as f:
            f.write(''.join(lines_t))

def index(request):
    results = None
    msg = None
    form = None
    analogy_examples = [
        [u'台灣', u'台北', u'法國'],
        [u'國民黨', u'馬英九', u'民進黨'],
        [u'海賊王', u'魯夫', u'火影忍者'],
        [u'爵士樂', u'紐奧良', u'鄉村音樂'],
        [u'中研院', u'李遠哲', u'工研院'],
        [u'台灣', u'台灣大學', u'美國'],
    ]
    if request.method == 'GET':
        form = PhraseAnologyQueryForm(request.GET)
        if request.GET:
            model = gensim.models.Word2Vec.load('../../var/zhwiki.model')
            phrase1 = opencc.convert(request.GET['phrase1'], 'zhs2zhtw_p.ini').strip()
            phrase2 = opencc.convert(request.GET['phrase2'], 'zhs2zhtw_p.ini').strip()
            phrase3 = opencc.convert(request.GET['phrase3'], 'zhs2zhtw_p.ini').strip()
            try:
                results = model.most_similar_cosmul(positive=[phrase3, phrase2],
                                                    negative=[phrase1])
            except KeyError:
                if phrase1 not in model.vocab:
                    not_exist_phrase = phrase1
                elif phrase2 not in model.vocab:
                    not_exist_phrase = phrase2
                else:
                    not_exist_phrase = phrase3
                msg = u'Word "%s" is not in vocabulary' % (not_exist_phrase)
    vars = {
        'form': form,
        'results': results,
        'msg': msg,
        'analogy_examples': analogy_examples,
    }
    return render_to_response('analogy/index.djhtml', vars)

def translate_subtitles(self):
    subtitles = self.extract_subtitles()
    for index, subtitle in enumerate(subtitles):
        content = subtitle.content
        locale = subtitle.filename.split(".")[-2]
        if re.match(r"chs", locale):
            content = content.decode("gbk")
            content = opencc.convert(content, config="s2t.json")
        else:
            content = content.decode("big5")
        subtitles[index] = SubtitleFile(filename=subtitle.filename, content=content)
    return subtitles

def polling(self, loop):
    '''
    :param loop: the default event loop
    :type loop: asyncio.BaseEventLoop
    '''
    offset = 0
    while True:
        try:
            req = yield from loop.run_in_executor(
                None,
                lambda: requests.post(self.prefix + '/getUpdates',
                                      timeout=None,
                                      data=dict(offset=offset, timeout=self.timeout)))
            j = req.json()
        except ValueError:
            self.debug("ERROR", req.text)
            continue
        except ConnectionError as e:
            self.debug("ERROR", str(e))
            continue
        if not j['ok'] or not j['result']:
            continue
        self.debug("receive", json.dumps(j))
        # update offset for the next `/getUpdates`
        offset = max([r['update_id'] + 1 for r in j['result']])
        for r in j['result']:
            # process inline queries
            if 'inline_query' in r:
                q = r['inline_query']
                cid = q['from']['id']
                if cid in self.allowed:
                    resp = self.q.inline(q['id'], q['offset'] or 0, q['query'])
                    self.sender.send_resp(cid, resp)
            elif 'chosen_inline_result' in r:
                continue  # TODO: ???
            else:
                # normal messages
                m = r['message']
                self.debug("message", json.dumps(m))
                mid = m['message_id']
                cid = m['chat']['id']
                if cid in self.allowed and 'text' in m and m['text'][0] == '/':
                    m['text'] = opencc.convert(m['text'])
                    resp = self.q.query(cid, m['text'][1:])
                    self.sender.send_resp(cid, resp, mid)
                else:
                    self.sender.send_resp(cid, Resp(message='mew?'))

def load_text(self, text_file):
    """Load and process a source text file whose type can be '.quote' or '.verse'."""
    self._clear_obj()
    assert os.path.exists(text_file), text_file
    assert text_file.endswith('quote') or text_file.endswith('verse')
    self.src_path = text_file
    with open(text_file, 'r') as f:
        text = f.read()
    if self.jianti:
        import opencc
        text = opencc.convert(text, config='t2s.json')
    if '--------' in text:
        self._parse_cooked(text)
    else:
        self._parse(text)

def pos_extract():
    path = '/home/haoming/iPIN/haoming_position_all_14/raw/'
    file_list = os.listdir(path)
    output = open('src/Prep_pos.out', 'w')
    count = 1
    for cur_file in file_list:
        if "crc" in cur_file:
            continue
        file_name = os.path.join(path, cur_file)
        f = open(file_name, 'r')
        end = 0
        while not end:
            line = f.readline()
            if line != '':
                job_item, position, description = line.split('\x01', 2)
                # Old version: for each position name, split at stopwords or
                # punctuation and store each piece as a new position name.
                # Revised 2015-08-03: discard any position name containing a
                # stopword or punctuation (spaces inside English names excepted).
                position = position.upper()  # upper-case all English letters
                position = opencc.convert(position, config='zht2zhs.ini')  # Traditional -> Simplified
                if (position not in illegal) and (not hasSymbol(position.encode('utf-8'))):  # and (position.isdigit() == False)
                    output.write(position.encode('utf8') + '\n')
                    count = count + 1
                    print count, position
            else:
                end = 1
                line = f.readline()
        f.close()
    print "*******Complete!*******"

def process_TW(map_CN, locale_TW):
    # Process the Taiwan translation
    map_TW = get_kvmap_in_locale_file(locale_TW)
    error_keys = []
    result_TW = ''
    for k, v in map_TW.items():
        if not v and map_CN[k]:
            try:
                v_CN = codecs.decode(map_CN[k], 'unicode_escape')
                v_TW = opencc.convert(v_CN, config='s2twp.json')
                v_TW = codecs.encode(v_TW, 'unicode_escape').decode()
                v = v_TW.upper().replace('\\U', '\\u')
            except Exception as e:
                print(str(e))
                error_keys.append(k)
        result_TW += k + '=' + v + '\n'
    with open(locale_TW, 'w') as f:
        f.write(result_TW)
    print('Please handle the following keys manually:')
    for k in error_keys:
        print(k)

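# A minimal sketch of the unicode_escape round trip used in process_TW()
# above, for Java-style .properties files; the sample values are hypothetical.
import codecs

v_CN = codecs.decode('\\u7b80\\u4f53', 'unicode_escape')   # -> '简体'
v_TW = codecs.encode('繁體', 'unicode_escape').decode()     # -> '\u7e41\u9ad4'
# upper-case the hex digits but keep the \u marker lowercase
print(v_TW.upper().replace('\\U', '\\u'))                   # -> '\u7E41\u9AD4'
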
def to_record(tweet):
    """Convert a tweet of type dict to a Tweet database instance

    :param dict tweet: a tweet
    :return: Tweet database instance
    :rtype: Tweet
    """
    if 'retweeted_status' in tweet:
        typ = 'rt'
    elif tweet['in_reply_to_status_id']:
        typ = 'reply'
    elif tweet['is_quote_status']:
        typ = 'quote'
    else:
        typ = 'tweet'
    timestamp = int(parse_time(tweet['created_at']).timestamp())
    text = opencc.convert(tweet['text'])
    t = Tweet(id=int(tweet['id']),
              user_id=tweet['user']['id'],
              type=typ,
              timestamp=int(timestamp),
              tweet=json.dumps(tweet),
              text=text)
    return t

def clean(text):
    # delset = string.punctuation + string.digits + ' '
    # text = text.translate(None, delset)
    text = text.upper()
    text = opencc.convert(text, config='zht2zhs.ini')  # Traditional -> Simplified
    for i in symbol:
        text = text.replace(i, '\n')
    if '兼' in text and '兼职' not in text:
        text = text.replace('兼', '\n')
    if '+' in text and 'C++' not in text:
        text = text.replace('+', '\n')
    if '#' in text and 'C#' not in text:
        text = text.replace('#', '\n')
    if '.' in text and '.NET' not in text:
        text = text.replace('.', '\n')
    if ' ' in text and not text.replace(' ', '').isalnum():
        text = text.replace(' ', '\n')
    return text

def wiki_to_term():
    mariadb_info = json.load(open('../etc/mariadb_settings.json'))
    engine = sqlalchemy.create_engine('mysql+mysqldb://%s:%s@%s/%s' % (
        mariadb_info['user'], mariadb_info['pwd'],
        mariadb_info['host'], mariadb_info['db']))
    connection = engine.connect()
    print 'Processing wikipedia titles'
    # details of namespaces: https://en.wikipedia.org/wiki/Wikipedia:Namespace
    valid_wiki_namespace = [0, 118]
    result = connection.execute(
        """SELECT page_title FROM page WHERE page_namespace in (%s)"""
        % (','.join([str(s) for s in valid_wiki_namespace])))
    terms = set()
    for i, row in enumerate(result, 1):
        sys.stdout.write('\r%i / %i' % (i, result.rowcount))
        title_unicode = row['page_title'].decode('utf-8')
        if is_all_chinese_chars(title_unicode) and len(title_unicode) > 1:
            terms.add(opencc.convert(title_unicode.encode('utf-8'), 'zhs2zhtw_p.ini'))
    connection.close()
    sys.stdout.write('\nSorting and saving results...')
    with open('../var/tw-wiki-dict.dict', 'w') as f_out:
        for t in sorted(terms):
            f_out.write('%s\n' % (t.encode('utf-8')))
    print 'Done'

def format_tweet(self, tweet):
    """Format a single tweet.

    :param dict tweet: a tweet object
    :return: formatted string of a tweet
    :rtype: string
    """
    rep = {}
    entities = tweet.get('entities', {})
    for u in entities.get('urls', []) + entities.get('media', []):
        idx = tuple(u['indices'])
        rep.setdefault(idx, []).append('[{}]({})'.format(
            u['display_url'], u.get('media_url', u['expanded_url'])))
    for u in entities.get('user_mentions', []):
        idx = tuple(u['indices'])
        rep.setdefault(idx, []).append('[{}]({})'.format(
            '@' + u['screen_name'], 'twitter.com/' + u['screen_name']))
    for u in entities.get('hashtags', []):
        idx = tuple(u['indices'])
        rep.setdefault(idx, []).append('[{}]({})'.format(
            '#' + u['text'], 'twitter.com/hashtag/{}?src=hash'.format(u['text'])))
    for u in entities.get('symbols', []):
        idx = tuple(u['indices'])
        rep.setdefault(idx, []).append('[{}]({})'.format(
            '$' + u['text'], 'twitter.com/search?q=${}&src=ctag'.format(u['text'])))
    text = list(opencc.convert(tweet['text']))
    last = len(text)
    for idx in sorted(rep.keys(), reverse=True):
        st, ed = idx
        if ed < last:
            # escape the other parts
            text[ed:last] = self.E('_*[`')(unescape(''.join(text[ed:last])))
        text[st:ed] = ' '.join(rep[idx])
        last = st
    text[0:last] = self.E('_*[`')(unescape(''.join(text[0:last])))
    return ''.join(text)

def index(request):
    results = None
    msg = None
    form = PhraseRelevanceQueryForm(request.GET)
    selected_model = form.available_models[0][0]
    sample_phrases = [u'美國', u'魯夫', u'二二八事件', u'宋朝', u'八八風災', u'電話']
    if request.method == 'GET':
        if request.GET:
            selected_model = request.GET['selected_model']
            model = gensim.models.Word2Vec.load(selected_model)
            phrase1 = opencc.convert(request.GET['phrase1'], 'zhs2zhtw_p.ini').strip()
            try:
                results = model.most_similar(phrase1)
            except KeyError:
                msg = u'Word "%s" is not in vocabulary' % (phrase1)
    vars = {
        'form': form,
        'results': results,
        'msg': msg,
        'sample_phrases': sample_phrases,
        'selected_model': selected_model,
    }
    return render_to_response('relevance/index.djhtml', vars)

def before_index(self, pkg_dict):
    title = pkg_dict['title']
    title = opencc.convert(title, config='zhtw2zhcn_s.ini')
    seg_list = jieba.cut_for_search(title)
    pkg_dict['title'] = " ".join(seg_list)
    return pkg_dict

def traditionalize(text):
    return opencc.convert(text, config="zhs2zht.ini").encode("utf-8")

def populate_tr_fields(self):
    import opencc
    self.quote_tr = opencc.convert(self.quote or "", config="s2t.json")

def populate_tr_fields(self): import opencc self.name_tr = opencc.convert(self.name or "", config="s2t.json") self.intro_tr = opencc.convert(self.intro or "", config="s2t.json")
def simplify(text):
    return opencc.convert(text, config='t2s.json')

def populate_tr_fields(self): import opencc self.name_tr = opencc.convert(self.name or "", config='s2t.json')
import pickle

import opencc

data = []
with open("./newsData.pkl", 'rb') as fr:
    data = pickle.load(fr)

dataCn = []
for post in data:
    postCn = []
    for ele in post:
        postCn.append(opencc.convert(ele))
    dataCn.append(postCn)

with open("./newsDataCn.pkl", 'wb') as fw:
    pickle.dump(dataCn, fw)

from __future__ import division

import os
import re

import opencc
from gensim import corpora

from scwsseg.utils import load_scws, cut, load_emotion_words

AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

cut_str = load_scws()
emotions_words = load_emotion_words()
emotions_words = [unicode(e, 'utf-8') for e in emotions_words]
t_emotions_words = [opencc.convert(e, config='zhs2zht.ini') for e in emotions_words]
emotions_words.extend(t_emotions_words)
emotions_words = [w.encode('utf-8') for w in emotions_words]
emotions_words_set = set(emotions_words)
emotion_pattern = re.compile(r'\[(\S+?)\]')


def if_emoticoned_weibo(r):
    # does the weibo text contain any of the specified emoticons?
    emotions = re.findall(emotion_pattern, r['text'])
    is_emoticoned = 1 if set(emotions) & emotions_words_set else 0
    return is_emoticoned


def if_empty_retweet_weibo(r):
    # this function is currently unused, so it is not given much attention

def test_convert():
    text = '乾坤一擲'
    expect = '乾坤一掷'
    assert convert(text) == expect

def test_parse_content(self):
    document = self.item.parse_content()
    converted_keyword = opencc.convert(self.KEYWORD, config="t2s.json")
    self.assertIn(converted_keyword, document.title.text)
