Example #1
    def kg_search_parse(self, response):
        item = response.meta['item']
        # Work on the decoded response text; response.body is raw bytes.
        result = response.text
        # Capture the "articleBody" value up to the next comma (non-greedy).
        description = re.findall(r'"articleBody": (.*?),', result)
        if len(description) != 0:
            description = HanziConv.toSimplified(description[0])
        else:
            description = u'暂无数据'  # "no data available"
        subtitle = re.findall(r'"name":(.*?),', result)
        if len(subtitle) != 0:
            subtitle = HanziConv.toSimplified(subtitle[0])
        else:
            subtitle = u'暂无数据'  # "no data available"
        yield {
            'type': 'shopping',
            'city': item['city'],
            'name': item['name'],
            'address': item['address'],
            'mobile': item['mobile'],
            'open_hours': item['open_hours'],
            'description': description,
            'lat': item['lat'],
            'lng': item['lng'],
            'subtitle': subtitle
        }
    def kg_search_parse(self, response):
        """
        :description
            Run a knowledge-graph lookup on the restaurant name.
        :param response:
        :return:
        """
        item = response.meta['item']
        # Work on the decoded response text; response.body is raw bytes.
        result = response.text
        description = re.findall(r'"articleBody": (.*?),', result)
        if len(description) != 0:
            description = HanziConv.toSimplified(description[0])
        else:
            description = u'暂无数据'  # "no data available"
        item['description'] = description
        subtitle = re.findall(r'"name":(.*?),', result)
        if len(subtitle) != 0:
            subtitle = HanziConv.toSimplified(subtitle[0])
        else:
            subtitle = u'暂无数据'  # "no data available"
        item['subtitle'] = subtitle
        # Chain a Baidu Baike request for the same name.
        baike_url = 'https://baike.baidu.com/item/%s' % item['name']
        yield scrapy.Request(baike_url,
                             callback=self.baike_parse,
                             meta={'item': item},
                             dont_filter=True)
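Both kg_search_parse variants above hinge on HanziConv.toSimplified, which maps traditional-Chinese characters to their simplified forms and leaves already-simplified text untouched. A minimal standalone call (the sample strings are chosen purely for illustration):

from hanziconv import HanziConv

print(HanziConv.toSimplified(u'漢字轉換'))  # -> 汉字转换
print(HanziConv.toSimplified(u'暂无数据'))  # already simplified, returned unchanged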
Example #4
def process_line_as_training(l):
    # Convert one raw corpus line to simplified characters and split it on
    # the ideographic space (U+3000) used as the word separator.
    if sys.version_info >= (3, 0):
        decoded_line = HanziConv.toSimplified(
            l.decode('utf8')).strip().split('\u3000')
    else:
        # On Python 2 a plain '\u3000' literal is not a unicode escape,
        # hence the u'' prefix.
        decoded_line = HanziConv.toSimplified(
            l.decode('utf8')).strip().split(u'\u3000')
    return [w.strip('\r\n') for w in decoded_line]
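As a rough usage sketch for the helper above (the corpus line is invented, not taken from any of these projects), one byte line of an ideographic-space-separated corpus becomes a list of simplified words:

# Hypothetical training line: words separated by U+3000 (ideographic space),
# read in binary mode as the calling code does.
sample = u'這是\u3000一個\u3000例子\r\n'.encode('utf8')
print(process_line_as_training(sample))
# -> ['这是', '一个', '例子']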
def findAnswer(no):
    ansN = 0
    ans = ''
    qstring = cutQuestion1(no)
    tStart = time.time()
    result = Google(qstring)
    tEnd = time.time()
    period = tEnd - tStart
    print("Google over. It cost %f sec" % period)
    # Count how often each candidate answer appears in the search result.
    for op in "ABC":
        score = result.count(q_list[no][op])
        if score > ansN:
            ans = op
            ansN = score
    if ansN == 0:
        # Retry with the options converted to simplified characters.
        for op in "ABC":
            score = result.count(HanziConv.toSimplified(q_list[no][op]))
            if score > ansN:
                ans = op
                ansN = score
        if ansN == 0:
            # Still no match: fall back to a default guess.
            ans = 'B'
    print(ans, q_list[no][ans])
    outstring = ''.join(['Ans: ', ans, ' ', q_list[no][ans], '\n'])
    outfile.write(outstring)
    return ans
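The selection logic above is a plain frequency vote: whichever option string occurs most often in the Google result text wins, with a second pass over simplified-character variants and a hard-coded fallback of 'B'. A toy, self-contained version of that vote (the result text and options are invented for illustration):

from hanziconv import HanziConv

result = u'...北京是中国的首都...北京...'            # pretend search-result text
options = {'A': u'北京', 'B': u'上海', 'C': u'廣州'}  # pretend answer options
scores = {op: result.count(HanziConv.toSimplified(text))
          for op, text in options.items()}
ans = max(scores, key=scores.get) if any(scores.values()) else 'B'
print(ans)  # -> A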
Example #6
    def tokenize(self, text):
        text = HanziConv.toSimplified(text)
        sents = []
        for raw_sent in self.css_model.tokenize(text):
            sent = " ".join(list(self.cws_model.tokenize(raw_sent))).strip()
            sents.append(sent)
        text = "\n\n".join(sents)

        lowercase_text = text.lower()
        paragraph_breaks = self._mark_paragraph_breaks(text)
        text_length = len(lowercase_text)

        # Tokenization step starts here

        # Remove punctuation
        nopunct_text = ''.join(c for c in lowercase_text
                               if not is_punctuation_mark(c))
        nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text)

        tokseqs = self._divide_to_tokensequences(nopunct_text)

        # The morphological stemming step mentioned in the TextTiling
        # paper is not implemented.  A comment in the original C
        # implementation states that it offers no benefit to the
        # process. It might be interesting to test the existing
        # stemmers though.
        # words = _stem_words(words)

        # Filter stopwords
        for ts in tokseqs:
            ts.wrdindex_list = [
                wi for wi in ts.wrdindex_list
                if len(wi[0]) > 1 and wi[0] not in self.stopwords
            ]

        token_table = self._create_token_table(tokseqs, nopunct_par_breaks)
        # End of the Tokenization step

        gap_scores = self._block_comparison(tokseqs, token_table)
        smooth_scores = self._smooth_scores(gap_scores)
        depth_scores = self._depth_scores(smooth_scores)
        segment_boundaries = self._identify_boundaries(depth_scores)
        normalized_boundaries = self._normalize_boundaries(
            text, segment_boundaries, paragraph_breaks)
        segmented_text = []
        prevb = 0
        for b in normalized_boundaries:
            if b == 0:
                continue
            segmented_text.append(text[prevb:b])
            prevb = b
        if prevb < text_length:  # append any text that may be remaining
            segmented_text.append(text[prevb:])
        if not segmented_text:
            segmented_text = [text]
        if self.demo_mode:
            return gap_scores, smooth_scores, depth_scores, segment_boundaries
        return segmented_text
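The punctuation-stripping step above calls an external is_punctuation_mark helper that is not shown in this excerpt. A rough stand-in (an assumption about its behaviour, not the original implementation) can be built on unicodedata, which also covers full-width CJK punctuation:

import unicodedata

def is_punctuation_mark(ch):
    # True for any character in a Unicode punctuation category
    # (Pc, Pd, Pe, Pf, Pi, Po, Ps), e.g. ',', '。' and '「'.
    return unicodedata.category(ch).startswith('P')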
    def perform(self, text):
        if self.traditional:
            simplified_text = HanziConv.toSimplified(text)
        else:
            simplified_text = text
        try:
            req = urllib.request.Request(url=self.url,
                                         data=simplified_text.encode("utf-8"),
                                         method="POST")
            # Note: urlopen's timeout argument is given in seconds.
            with urllib.request.urlopen(req, timeout=15000) as f:
                results = json.loads(f.read().decode("utf-8"))
        except Exception:
            print("Failed to parse the input: %s" % text)
            return []
        return self.output(results, text)
def _create_vocab(path_list):
    """
    Create vocab objects
    """

    counter = Counter()
    row_count = 0

    for file_path in path_list:
        print("Processing " + file_path)
        with open(file_path, 'rb') as f:
            for l in f:
                # Count simplified characters line by line.
                counter.update(HanziConv.toSimplified(l.decode('utf8')))
                row_count = row_count + 1

    print("Total char:", len(counter))

    # Drop the space character and sort by descending count.
    word_counts = [x for x in counter.items() if x[0] != ' ']
    word_counts.sort(key=lambda x: x[1], reverse=True)
    print("Words in vocabulary:", len(word_counts))

    # Write out the word counts file.
    with open(FLAGS.word_counts_output_file, "wb") as f:
        line = "\n".join("%s %d" % (w, c) for w, c in word_counts).encode('utf8')
        f.write(line)
    print("Wrote vocabulary file:", FLAGS.word_counts_output_file)

    # Create the vocabulary dictionary.
    reverse_vocab = [x[0] for x in word_counts]
    unk_id = len(reverse_vocab)
    vocab_dict = dict([(x, y) for (y, x) in enumerate(reverse_vocab)])
    id_vocab_dict = dict([(y, x) for (y, x) in enumerate(reverse_vocab)])
    vocab = Vocabulary(vocab_dict, id_vocab_dict, unk_id)

    return vocab
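The Vocabulary container returned here is defined elsewhere in the source project. A minimal stand-in for what this code appears to assume (the names and the lookup helper below are guesses, not the project's actual API):

import collections

# Forward and reverse char/id mappings plus the id reserved for unknown chars.
Vocabulary = collections.namedtuple('Vocabulary',
                                    ['vocab_dict', 'id_vocab_dict', 'unk_id'])

def char_to_id(vocab, ch):
    # Fall back to unk_id for characters never seen while counting.
    return vocab.vocab_dict.get(ch, vocab.unk_id)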
Example #10
def findAnswer(no):
    ansN = 0
    ans = ''
    qstring = cutQuestion1(no)
    print("%d Googling... " % no)
    result = Google(qstring)
    print("Google over")
    # Count how often each candidate answer appears in the search result.
    for op in "ABC":
        score = result.count(q_list[no][op])
        if score > ansN:
            ans = op
            ansN = score
    if ansN == 0:
        # Retry with the options converted to simplified characters.
        for op in "ABC":
            score = result.count(HanziConv.toSimplified(q_list[no][op]))
            if score > ansN:
                ans = op
                ansN = score
        if ansN == 0:
            # Still no match: fall back to a default guess.
            ans = 'B'
    # print(ans, ansN)
    return ans
Example #12
def process_line_cityu(l):
    # CityU corpus lines use a single ASCII space as the word separator.
    decoded_line = HanziConv.toSimplified(l.decode('utf8')).strip().split(' ')
    return [w.strip('\r\n') for w in decoded_line]
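As with the training-corpus helper, a rough usage sketch with an invented CityU-style line (single-space separated):

# Hypothetical space-separated corpus line, read in binary mode.
sample = u'香港 特別 行政區\r\n'.encode('utf8')
print(process_line_cityu(sample))
# -> ['香港', '特别', '行政区']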