def run_convert_sentences(self, sentences_path, sentences_market_path) : file_operator = TextFileOperator() sentences = file_operator.reading(sentences_path) sentences = [[word.split('<:>')[0] for word in sentence] for sentence in sentences] loader = PickleMarket() loader.dump_market(sentences, sentences_market_path) print 'converting sentences finished ...'
def read_article(self, article_path) : """ Read source article. Each row is an article. Colunm[0] is the id of article. Column[1:] is the attributes of article. """ file_operator = TextFileOperator() data_list = file_operator.reading(article_path) entry_list = data_list[0] source_list = [] length = len(data_list[1:]) - 1 for idx, data in enumerate(data_list[1:]) : if len(data) >= len(entry_list) : article = dict() article['id'] = data[0] article['url'] = data[1] article['pub_time'] = data[2] article['title'] = data[3] article['content'] = data[4] article['n_zan'] = data[5] article['n_forward'] = data[6] article['n_click'] = data[7] article['n_collect'] = data[8] article['read_time'] = data[9] article['finish_rate'] = data[10] source_list.append(article) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) return source_list
def read_participle(self, articles, participle_path) : """ Read participle title. Each row is an article. Colunm[0] is the id of article. Column[1] is the word of participle title. Column[2] is the word of participle content. """ file_operator = TextFileOperator() data_list = file_operator.reading(participle_path) entry_list = data_list[0] article_dict = dict() length = len(data_list[1:]) - 1 for idx, data in enumerate(data_list[1:]) : if len(data) >= len(entry_list) : article = dict() article['id'] = data[0] article['participle_title'] = [Word(word) for word in data[1].split(' ')] article['participle_content'] = [Word(word) for word in data[2].split(' ')] article_dict[article['id']] = article if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) length = len(articles) - 1 for idx, article in enumerate(articles) : if article['id'] in article_dict : article['participle_title'] = article_dict[article['id']]['participle_title'] article['participle_content'] = article_dict[article['id']]['participle_content'] if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) return articles
def read_sentences(self, source_path, type='all') : """ Read participle sentences. Each row is a sentence. """ file_operator = TextFileOperator() data_list = file_operator.reading(source_path) entry_list = data_list[0] sentences = list() length = len(data_list[1:]) - 1 if type == 'all' : for idx, data in enumerate(data_list[1:]) : if len(data) >= len(entry_list) : sentence = [Word(word, sp_char=':').to_string() for word in data[0].split(' ')] sentences.append(sentence) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) elif type == 'name' : for idx, data in enumerate(data_list[1:]) : if len(data) >= len(entry_list) : sentence = [Word(word, sp_char=':').name for word in data[0].split(' ')] sentences.append(sentence) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) return sentences
def read_article(self, article_path) : """ Read source article. Each row is an article. Colunm[0] is the id of article. Column[1:] is the attributes of article. """ file_operator = TextFileOperator() data_list = file_operator.reading(article_path) entry_list = data_list[0] source_list = [] length = len(data_list[1:]) - 1 for idx, data in enumerate(data_list[1:]) : if len(data) >= len(entry_list) : article = dict() article['id'] = data[0] article['url'] = data[1] article['title'] = data[2] article['content'] = data[3] article['participle_title'] = [Word(word) for word in data[4].split(' ')] article['participle_content'] = [Word(word) for word in data[5].split(' ')] article['label'] = data[6] source_list.append(article) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) return source_list
def _read_dictionary(self, pos_path) :
    """ Load a dictionary file: the first column of every non-empty
        row becomes a key mapped to the counter value 0.
    """
    rows = TextFileOperator().reading(pos_path)
    return dict((row[0], 0) for row in rows if len(row) >= 1)
def run_tag_sentences(self, tag_tree_path, sentences_market_path, tags_path, untag_sentence_path) : file_operator = TextFileOperator() sentences = [u'技能贴 | 黑色打底裤的10种正确穿搭方式', u'春季男鞋韩版潮流行英伦男士休闲鞋'] cmd_list = file_operator.reading(tag_tree_path) tag_tree = TagTree(cmd_list) robot = Robot() tags, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:1000]) print 'finish'
def run_robot(self, tag_tree_path, sentences_market_path, tags_path) : robot = Robot() loader = PickleMarket() file_operator = TextFileOperator() cmd_list = file_operator.reading(tag_tree_path) tag_tree = TagTree(cmd_list) sentences = loader.load_market(sentences_market_path) tags = loader.load_market(tags_path) print 'start' string = raw_input().decode('gb18030') # string = u'我想要毛衣' sentences = robot.question_and_answer(string, sentences, tags, tag_tree)
def run_tag_sentences(self, tag_tree_path, sentences_market_path, tags_martket_path, dict_market_path) : file_operator = TextFileOperator() loader = PickleMarket() sentences = loader.load_market(sentences_market_path) cmd_list = file_operator.reading(tag_tree_path) tag_tree = TagTree(cmd_list) robot = Robot() tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:]) loader = JsonMarket() loader.dump_market(tags, tags_martket_path) loader.dump_market(tag_tree.dict_tuple, dict_market_path) print '%.2f%% article >= 1 tags, number is, %d.' \ % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences)) \ % len([tag for tag in tags_show if len(tag) >= 1])
class BaseSegementor :
    """ Common base for segmentors; provides split-dictionary loading. """

    def __init__(self) :
        pass

    def _read_dictionary(self, split_path) :
        """ Read ``split_path`` and map the first column of every
            non-empty row to None, keeping the first occurrence.
            Side effect: caches the reader on ``self.file_operator``.
        """
        self.file_operator = TextFileOperator()
        rows = self.file_operator.reading(split_path)
        split_dict = dict()
        for row in rows :
            if len(row) >= 1 and row[0] not in split_dict :
                split_dict[row[0]] = None
        return split_dict
def run_test(self, tag_tree_path, sentences_market_path, tags_path, \ tags_martket_path, untag_sentence_path) : file_operator = TextFileOperator() loader = PickleMarket() sentences = loader.load_market(sentences_market_path) cmd_list = file_operator.reading(tag_tree_path) tag_tree = TagTree(cmd_list) robot = Robot() tags, tags_show, untag_sentences = robot.tag_sentences(tag_tree, sentences[0:]) loader = JsonMarket() self.write_tags(sentences, tags_show, tags_path) loader.dump_market(tags, tags_martket_path) file_operator.writing(untag_sentences, untag_sentence_path) # loader.dump_market(untag_sentences, sentences_market_path) # print '%.2f%% article >= 2 tags' % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences)) print '%.2f%% article >= 3 tags' % (100.0 * len([tag for tag in tags_show if len(tag) >= 1]) / len(sentences))
def read_sentences(self, sentences_path) : """ Read participle sentences. Each row is a sentence. """ file_operator = TextFileOperator() data_list = file_operator.reading(sentences_path) entry_list = data_list[0] sentences = list() length = len(data_list[1:]) - 1 for idx, data in enumerate(data_list[1:]) : if len(data) >= len(entry_list) : sentence = data[0].upper() sentences.append(sentence) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) return sentences
def read_participle_title(self, title_path) : """ Read participle title. Each row is an article. Colunm[0] is the id of article. Column[1:] is the word of participle title. """ file_operator = TextFileOperator() data_list = file_operator.reading(title_path) entry_list = data_list[0] source_list = list() length = len(data_list[1:]) - 1 for idx, data in enumerate(data_list[1:]) : if len(data) >= len(entry_list) : article = dict() article['id'] = data[0] article['participle_title'] = [Word(word) for word in data[1].split(' ')] source_list.append(article) if idx % 100 == 0 : print 'finish rate is %.2f%%\r' % (100.0*idx/length), print 'finish rate is %.2f%%\r' % (100.0*idx/length) return source_list