Code Example #1
 def run_convert_sentences(self, sentences_path, sentences_market_path) :
     file_operator = TextFileOperator()
     sentences = file_operator.reading(sentences_path)
     sentences = [[word.split('<:>')[0] for word in sentence] for sentence in sentences]
     loader = PickleMarket()
     loader.dump_market(sentences, sentences_market_path)
     print 'converting sentences finished ...'
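
These snippets lean on a few helper classes that are never shown here: TextFileOperator, PickleMarket, and (later) JsonMarket. Purely to make the examples easier to follow, here is a minimal sketch of what they might look like, inferred only from how they are called above: reading() returns the file as a list of rows split into columns, writing() does the reverse, and dump_market()/load_market() wrap pickle. The delimiter, the encoding, and every method body are assumptions, not the project's actual implementation.

import codecs
import pickle

class TextFileOperator(object) :

    def reading(self, path, sep=u'\t') :
        # assumed format: one record per line, columns separated by sep
        with codecs.open(path, 'r', encoding='utf-8') as handle :
            return [line.rstrip(u'\r\n').split(sep) for line in handle]

    def writing(self, rows, path, sep=u'\t') :
        # rows may be lists of columns or plain strings
        with codecs.open(path, 'w', encoding='utf-8') as handle :
            for row in rows :
                if isinstance(row, list) :
                    row = sep.join(row)
                handle.write(row + u'\n')

class PickleMarket(object) :

    def dump_market(self, obj, path) :
        # serialize any picklable object to disk
        with open(path, 'wb') as handle :
            pickle.dump(obj, handle)

    def load_market(self, path) :
        # load an object previously saved with dump_market()
        with open(path, 'rb') as handle :
            return pickle.load(handle)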
Code Example #2
 def read_article(self, article_path) :
     """ Read source article.
         Each row is an article.
         Column[0] is the id of the article.
         Column[1:] are the attributes of the article.
     """
     file_operator = TextFileOperator()
     data_list = file_operator.reading(article_path)
     entry_list = data_list[0]
     source_list = []
     length = len(data_list[1:]) - 1
     for idx, data in enumerate(data_list[1:]) :
         if len(data) >= len(entry_list) :
             article = dict()
             article['id'] = data[0]
             article['url'] = data[1]
             article['pub_time'] = data[2]
             article['title'] = data[3]
             article['content'] = data[4]
             article['n_zan'] = data[5]
             article['n_forward'] = data[6]
             article['n_click'] = data[7]
             article['n_collect'] = data[8]
             article['read_time'] = data[9]
             article['finish_rate'] = data[10]
             source_list.append(article)
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     return source_list
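
A hedged usage sketch: assuming the article file is a delimited text file with a header row followed by the eleven columns listed in the docstring, a call might look like the following. ArticleReader and the file path are placeholders for whichever class actually owns read_article.

# Hypothetical call; ArticleReader and the path are illustrative only.
reader = ArticleReader()
articles = reader.read_article(u'data/articles.txt')
print '%d articles loaded, first title: %s' % (len(articles), articles[0]['title'])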
Code Example #3
 def read_participle(self, articles, participle_path) :
     """ Read participle title.
         Each row is an article.
         Colunm[0] is the id of article.
         Column[1] is the word of participle title.
         Column[2] is the word of participle content.
     """
     file_operator = TextFileOperator()
     data_list = file_operator.reading(participle_path)
     entry_list = data_list[0]
     article_dict = dict()
     length = len(data_list[1:]) - 1
     for idx, data in enumerate(data_list[1:]) :
         if len(data) >= len(entry_list) :
             article = dict()
             article['id'] = data[0]
             article['participle_title'] = [Word(word) for word in data[1].split(' ')]
             article['participle_content'] = [Word(word) for word in data[2].split(' ')]
             article_dict[article['id']] = article
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     length = len(articles) - 1
     for idx, article in enumerate(articles) :
         if article['id'] in article_dict :
             article['participle_title'] = article_dict[article['id']]['participle_title']
             article['participle_content'] = article_dict[article['id']]['participle_content']
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     return articles
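
Word is another helper that never appears in these snippets. Judging only from its uses (Word(token), Word(token, sp_char=':'), the .name attribute in Code Example #4, and .to_string()), it seems to parse segmenter tokens of the form surface<sep>tag. A minimal sketch under that assumption; the default separator and the attribute set are guesses:

class Word(object) :

    def __init__(self, token, sp_char='<:>') :
        # token is assumed to look like u'surface<:>pos' (or u'surface:pos')
        parts = token.split(sp_char)
        self.name = parts[0]
        self.pos = parts[1] if len(parts) > 1 else u''
        self.sp_char = sp_char

    def to_string(self) :
        # reassemble the original token
        if self.pos :
            return self.name + self.sp_char + self.pos
        return self.name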
Code Example #4
 def read_sentences(self, source_path, type='all') :
     """ Read participle sentences.
         Each row is a sentence.
     """
     file_operator = TextFileOperator()
     data_list = file_operator.reading(source_path)
     entry_list = data_list[0]
     sentences = list()
     length = len(data_list[1:]) - 1
     if type == 'all' :
         for idx, data in enumerate(data_list[1:]) :
             if len(data) >= len(entry_list) :
                 sentence = [Word(word, sp_char=':').to_string() for word in data[0].split(' ')]
                 sentences.append(sentence)
             if idx % 100 == 0 :
                 print 'finish rate is %.2f%%\r' % (100.0*idx/length),
         print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     elif type == 'name' :
         for idx, data in enumerate(data_list[1:]) :
             if len(data) >= len(entry_list) :
                 sentence = [Word(word, sp_char=':').name for word in data[0].split(' ')]
                 sentences.append(sentence)
             if idx % 100 == 0 :
                 print 'finish rate is %.2f%%\r' % (100.0*idx/length),
         print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     return sentences
Code Example #5
 def read_article(self, article_path) :
     """ Read source article.
         Each row is an article.
         Column[0] is the id of the article.
         Column[1:] are the attributes of the article.
     """
     file_operator = TextFileOperator()
     data_list = file_operator.reading(article_path)
     entry_list = data_list[0]
     source_list = []
     length = len(data_list[1:]) - 1
     for idx, data in enumerate(data_list[1:]) :
         if len(data) >= len(entry_list) :
             article = dict()
             article['id'] = data[0]
             article['url'] = data[1]
             article['title'] = data[2]
             article['content'] = data[3]
             article['participle_title'] = [Word(word) for word in data[4].split(' ')]
             article['participle_content'] = [Word(word) for word in data[5].split(' ')]
             article['label'] = data[6]
             source_list.append(article)
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     return source_list
Code Example #6
 def _read_dictionary(self, pos_path) :
     file_operator = TextFileOperator()
     data_list = file_operator.reading(pos_path)
     dictionary = dict()
     for data in data_list :
         if len(data) >= 1 :
             dictionary[data[0]] = 0
     return dictionary
Code Example #7
 def run_tag_sentences(self, tag_tree_path, sentences_market_path, tags_path, untag_sentence_path) :
     file_operator = TextFileOperator()
     sentences = [u'技能贴 | 黑色打底裤的10种正确穿搭方式', u'春季男鞋韩版潮流行英伦男士休闲鞋']
     cmd_list = file_operator.reading(tag_tree_path)
     tag_tree = TagTree(cmd_list)
     robot = Robot()
     tags, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:1000])
     print 'finish'
Code Example #8
 def run_robot(self, tag_tree_path, sentences_market_path, tags_path) :
     robot = Robot()
     loader = PickleMarket()
     file_operator = TextFileOperator()
     cmd_list = file_operator.reading(tag_tree_path)
     tag_tree = TagTree(cmd_list)
     sentences = loader.load_market(sentences_market_path)
     tags = loader.load_market(tags_path)
     print 'start'
     string = raw_input().decode('gb18030')
     # string = u'我想要毛衣'
     sentences = robot.question_and_answer(string, sentences, tags, tag_tree)
Code Example #9
 def run_tag_sentences(self, tag_tree_path, sentences_market_path, tags_martket_path, dict_market_path) :
     file_operator = TextFileOperator()
     loader = PickleMarket()
     sentences = loader.load_market(sentences_market_path)
     cmd_list = file_operator.reading(tag_tree_path)
     tag_tree = TagTree(cmd_list)
     robot = Robot()
     tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:])
     loader = JsonMarket()
     loader.dump_market(tags, tags_martket_path)
     loader.dump_market(tag_tree.dict_tuple, dict_market_path)
     n_tagged = len([tag for tag in tags_show if len(tag) >= 1])
     print '%.2f%% of articles have >= 1 tag, count is %d.' \
         % (100.0 * n_tagged / len(sentences), n_tagged)
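
JsonMarket shares the dump_market()/load_market() interface with PickleMarket. A minimal sketch, assuming it simply swaps pickle for JSON text; the real class may differ:

import codecs
import json

class JsonMarket(object) :

    def dump_market(self, obj, path) :
        # ensure_ascii=False keeps Chinese text human-readable in the dump
        with codecs.open(path, 'w', encoding='utf-8') as handle :
            handle.write(json.dumps(obj, ensure_ascii=False))

    def load_market(self, path) :
        with codecs.open(path, 'r', encoding='utf-8') as handle :
            return json.loads(handle.read())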
Code Example #10
class BaseSegementor :

    def __init__(self) :
        pass
        
    def _read_dictionary(self, split_path) :
        self.file_operator = TextFileOperator()
        data_list = self.file_operator.reading(split_path)
        split_dict = dict()
        for data in data_list :
            if len(data) >= 1 :
                if data[0] not in split_dict :
                    split_dict[data[0]] = None
        return split_dict
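
BaseSegementor only loads the split dictionary; the concrete segmentation strategy lives in subclasses that are not shown. Purely as an illustration of how a subclass might consume split_dict, here is a sketch using forward maximum matching; this strategy and the class name are assumptions, not necessarily what the project's segmentors actually do.

class ForwardMatchSegmentor(BaseSegementor) :

    def __init__(self, split_path, max_len=8) :
        BaseSegementor.__init__(self)
        self.split_dict = self._read_dictionary(split_path)
        self.max_len = max_len

    def segment(self, sentence) :
        # forward maximum matching: greedily take the longest dictionary word
        # starting at the current position, falling back to a single character
        words = []
        start = 0
        while start < len(sentence) :
            end = min(start + self.max_len, len(sentence))
            while end > start + 1 and sentence[start:end] not in self.split_dict :
                end -= 1
            words.append(sentence[start:end])
            start = end
        return words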
Code Example #11
 def run_test(self, tag_tree_path, sentences_market_path, tags_path, \
     tags_martket_path, untag_sentence_path) :
     file_operator = TextFileOperator()
     loader = PickleMarket()
     sentences = loader.load_market(sentences_market_path)
     cmd_list = file_operator.reading(tag_tree_path)
     tag_tree = TagTree(cmd_list)
     robot = Robot()
     tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:])
     loader = JsonMarket()
     self.write_tags(sentences, tags_show, tags_path)
     loader.dump_market(tags, tags_martket_path)
     file_operator.writing(untag_sentences, untag_sentence_path)
     # loader.dump_market(untag_sentences, sentences_market_path)
     # print '%.2f%% article >= 2 tags' % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences))
     print '%.2f%% of articles have >= 1 tag' % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences))
Code Example #12
 def read_sentences(self, sentences_path) :
     """ Read participle sentences.
         Each row is a sentence.
     """
     file_operator = TextFileOperator()
     data_list = file_operator.reading(sentences_path)
     entry_list = data_list[0]
     sentences = list()
     length = len(data_list[1:]) - 1
     for idx, data in enumerate(data_list[1:]) :
         if len(data) >= len(entry_list) :
             sentence = data[0].upper()
             sentences.append(sentence)
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     return sentences
Code Example #13
 def read_participle_title(self, title_path) :
     """ Read participle title.
         Each row is an article.
         Column[0] is the id of the article.
         Column[1:] are the words of the participle title.
     """
     file_operator = TextFileOperator()
     data_list = file_operator.reading(title_path)
     entry_list = data_list[0]
     source_list = list()
     length = len(data_list[1:]) - 1
     for idx, data in enumerate(data_list[1:]) :
         if len(data) >= len(entry_list) :
             article = dict()
             article['id'] = data[0]
             article['participle_title'] = [Word(word) for word in data[1].split(' ')]
             source_list.append(article)
         if idx % 100 == 0 :
             print 'finish rate is %.2f%%\r' % (100.0*idx/length),
     print 'finish rate is %.2f%%\r' % (100.0*idx/length)
     return source_list