def kg_search_parse(self, response):
    item = response.meta['item']
    result = response.body
    description = re.findall(r'"articleBody": (.*)?,', result)
    if len(description) != 0:
        description = HanziConv.toSimplified(description[0])
    else:
        description = u'暂无数据'
    # print 'description: ' + description
    subtitle = re.findall(r'"name":(.*)?,', result)
    if len(subtitle) != 0:
        subtitle = HanziConv.toSimplified(subtitle[0])
    else:
        subtitle = u'暂无数据'
    yield {
        'type': 'shopping',
        'city': item['city'],
        'name': item['name'],
        'address': item['address'],
        'mobile': item['mobile'],
        'open_hours': item['open_hours'],
        'description': description,
        'lat': item['lat'],
        'lng': item['lng'],
        'subtitle': subtitle
    }
def kg_search_parse(self, response):
    """
    :description Run a knowledge-graph lookup on the restaurant name
    :param response:
    :return:
    """
    item = response.meta['item']
    result = response.body
    description = re.findall(r'"articleBody": (.*)?,', result)
    if len(description) != 0:
        description = HanziConv.toSimplified(description[0])
    else:
        description = u'暂无数据'  # "no data available"
    item['description'] = description
    subtitle = re.findall(r'"name":(.*)?,', result)
    if len(subtitle) != 0:
        subtitle = HanziConv.toSimplified(subtitle[0])
    else:
        subtitle = u'暂无数据'
    item['subtitle'] = subtitle
    baike_url = 'https://baike.baidu.com/item/%s' % item['name']
    yield scrapy.Request(baike_url,
                         callback=self.baike_parse,
                         meta={'item': item},
                         dont_filter=True)
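# HanziConv.toSimplified, used throughout these snippets, is the
# traditional-to-simplified converter from the hanziconv package; a quick
# standalone check of what it does (the sample strings are illustrative only):
from hanziconv import HanziConv

print(HanziConv.toSimplified(u'漢字轉換'))  # 汉字转换
print(HanziConv.toSimplified(u'暂无数据'))  # unchanged: already simplified ("no data available")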
def process_line_as_training(l):
    if sys.version_info >= (3, 0):
        decoded_line = HanziConv.toSimplified(
            l.decode('utf8')).strip().split('\u3000')
    else:
        decoded_line = HanziConv.toSimplified(
            l.decode('utf8')).strip().split(u'\u3000')
    return [w.strip('\r\n') for w in decoded_line]
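# A hypothetical usage sketch (Python 3): training lines are expected to hold
# words separated by the full-width ideographic space (U+3000), as in PKU-style
# segmentation corpora; the sample sentence below is illustrative only.
line = u'迈向\u3000充满\u3000希望\u3000的\u3000新\u3000世纪\r\n'.encode('utf8')
print(process_line_as_training(line))
# ['迈向', '充满', '希望', '的', '新', '世纪']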
def findAnswer(no):
    ansN = 0
    ans = ''
    qstring = cutQuestion1(no)
    tStart = time.time()
    result = Google(qstring)
    tEnd = time.time()
    period = tEnd - tStart
    print "Google over, it cost %f sec" % period
    # First pass: count how often each option appears in the search results.
    for op in "ABC":
        score = result.count(q_list[no][op])
        if score > ansN:
            ans = op
            ansN = score
    # Second pass: retry with the options converted to simplified Chinese.
    if ansN == 0:
        for op in "ABC":
            score = result.count(HanziConv.toSimplified(q_list[no][op]))
            if score > ansN:
                ans = op
                ansN = score
        if ansN == 0:
            # Nothing matched in either pass; fall back to a default guess.
            ans = 'B'
    print ans, q_list[no][ans]
    outstring = ''.join(['Ans: ', ans, ' ', q_list[no][ans], '\n'])
    outfile.write(outstring)
    return ans
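# Illustration of the scoring idea above on made-up data: count how often each
# option appears in the search-result text after converting it to simplified
# Chinese (the names result/options here are hypothetical, not from the code).
from hanziconv import HanziConv

result = u'调查显示答案是爱因斯坦，爱因斯坦提出了相对论'
options = {'A': u'牛頓', 'B': u'愛因斯坦', 'C': u'居里夫人'}
scores = {op: result.count(HanziConv.toSimplified(text))
          for op, text in options.items()}
print(max(scores, key=scores.get))  # 'B'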
def tokenize(self, text):
    text = HanziConv.toSimplified(text)
    sents = []
    for raw_sent in self.css_model.tokenize(text):
        sent = " ".join(list(self.cws_model.tokenize(raw_sent))).strip()
        sents.append(sent)
    text = "\n\n".join(sents)

    lowercase_text = text.lower()
    paragraph_breaks = self._mark_paragraph_breaks(text)
    text_length = len(lowercase_text)

    # Tokenization step starts here

    # Remove punctuation
    nopunct_text = ''.join(c for c in lowercase_text
                           if not is_punctuation_mark(c))
    nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text)

    tokseqs = self._divide_to_tokensequences(nopunct_text)

    # The morphological stemming step mentioned in the TextTile
    # paper is not implemented. A comment in the original C
    # implementation states that it offers no benefit to the
    # process. It might be interesting to test the existing
    # stemmers though.
    # words = _stem_words(words)

    # Filter stopwords
    for ts in tokseqs:
        ts.wrdindex_list = [
            wi for wi in ts.wrdindex_list
            if len(wi[0]) > 1 and wi[0] not in self.stopwords
        ]

    token_table = self._create_token_table(tokseqs, nopunct_par_breaks)
    # End of the Tokenization step

    gap_scores = self._block_comparison(tokseqs, token_table)
    smooth_scores = self._smooth_scores(gap_scores)
    depth_scores = self._depth_scores(smooth_scores)
    segment_boundaries = self._identify_boundaries(depth_scores)
    normalized_boundaries = self._normalize_boundaries(
        text, segment_boundaries, paragraph_breaks)

    segmented_text = []
    prevb = 0
    for b in normalized_boundaries:
        if b == 0:
            continue
        segmented_text.append(text[prevb:b])
        prevb = b

    if prevb < text_length:
        # append any text that may be remaining
        segmented_text.append(text[prevb:])

    if not segmented_text:
        segmented_text = [text]
    if self.demo_mode:
        return gap_scores, smooth_scores, depth_scores, segment_boundaries
    return segmented_text
def perform(self, text):
    if self.traditional:
        simplified_text = HanziConv.toSimplified(text)
    else:
        simplified_text = text
    try:
        req = urllib.request.Request(url=self.url,
                                     data=simplified_text.encode("utf-8"),
                                     method="POST")
        with urllib.request.urlopen(req, timeout=15000) as f:
            results = json.loads(f.read().decode("utf-8"))
    except Exception:
        print("Fail to parse the input: %s" % text)
        return []
    return self.output(results, text)
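# A minimal standalone sketch of the same POST-and-parse pattern used in
# perform(); segment_remote and the localhost URL are assumptions made for
# illustration, and the server's JSON response shape is not specified here.
import json
import urllib.request

from hanziconv import HanziConv

def segment_remote(text, url='http://localhost:8000/segment'):
    simplified = HanziConv.toSimplified(text)
    req = urllib.request.Request(url=url,
                                 data=simplified.encode('utf-8'),
                                 method='POST')
    with urllib.request.urlopen(req, timeout=15) as f:  # timeout is in seconds
        return json.loads(f.read().decode('utf-8'))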
def _create_vocab(path_list):
    """ Create vocab objects """
    counter = Counter()
    row_count = 0
    for file_path in path_list:
        print("Processing " + file_path)
        with open(file_path, 'rb') as f:
            for l in f:
                counter.update(HanziConv.toSimplified(l.decode('utf8')))
                row_count = row_count + 1
    print("Total char:", len(counter))

    # Drop the space character and sort by descending count.
    word_counts = [x for x in counter.items() if x[0] != ' ']
    word_counts.sort(key=lambda x: x[1], reverse=True)
    print("Words in vocabulary:", len(word_counts))

    # Write out the word counts file.
    with open(FLAGS.word_counts_output_file, "wb") as f:
        # line = str("\n".join(["%s %d" % (w, c) for w, c in word_counts]))
        line = ["%s %d" % (w, c) for w, c in word_counts]
        line = "\n".join(w for w in line).encode('utf8')
        f.write(line)
    print("Wrote vocabulary file:", FLAGS.word_counts_output_file)

    # Create the vocabulary dictionary.
    reverse_vocab = [x[0] for x in word_counts]
    unk_id = len(reverse_vocab)
    vocab_dict = dict([(x, y) for (y, x) in enumerate(reverse_vocab)])
    id_vocab_dict = dict([(y, x) for (y, x) in enumerate(reverse_vocab)])
    vocab = Vocabulary(vocab_dict, id_vocab_dict, unk_id)
    return vocab
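# _create_vocab returns a Vocabulary object that is not defined in this
# snippet; a minimal stand-in consistent with how it is constructed above
# (char-to-id and id-to-char dicts plus an unknown-char id) might look like:
class Vocabulary(object):
    def __init__(self, vocab_dict, id_vocab_dict, unk_id):
        self._vocab = vocab_dict        # char -> id
        self._id_vocab = id_vocab_dict  # id -> char
        self._unk_id = unk_id           # id used for characters not in vocab

    def char_to_id(self, char):
        return self._vocab.get(char, self._unk_id)

    def id_to_char(self, char_id):
        return self._id_vocab.get(char_id, u'<unk>')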
def findAnswer(no):
    ansN = 0
    ans = ''
    qstring = cutQuestion1(no)
    print "%d Googling... " % no
    result = Google(qstring)
    print "Google over "
    for op in "ABC":
        score = result.count(q_list[no][op])
        if score > ansN:
            ans = op
            ansN = score
    # Retry with the options converted to simplified Chinese.
    if ansN == 0:
        for op in "ABC":
            score = result.count(HanziConv.toSimplified(q_list[no][op]))
            if score > ansN:
                ans = op
                ansN = score
        if ansN == 0:
            # Still no match; fall back to a default guess.
            ans = 'B'
    # print ans, ansN
    return ans
def process_line_cityu(l):
    decoded_line = HanziConv.toSimplified(l.decode('utf8')).strip().split(' ')
    return [w.strip('\r\n') for w in decoded_line]
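# A hypothetical usage sketch: CityU-style lines are traditional Chinese with
# words separated by an ASCII space, so toSimplified does real work here; the
# sample line below is illustrative only.
line = u'香港 特別 行政區\r\n'.encode('utf8')
print(process_line_cityu(line))
# ['香港', '特别', '行政区']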