class Bot:
    def __init__(self):
        """
        The main speech module is Speech, defined in this file.
        The dictionary module used is PyDictionary.
        """
        self.speech = Speech()
        self.dictionary = PyDictionary()
        self.universal = Universal(self.speech)
        self.meaning = Meaning(self.speech)
        self.synonym = Synonym(self.speech)
        self.antonym = Antonym(self.speech)

    def speak(self):
        # listen once and dispatch to the matching handler
        sent = self.speech.listen()
        print(sent)
        if 'meaning of' in sent:
            self.meaning.Start_Meaning(sent)
        elif 'synonyms' in sent:
            self.synonym.Start_Synonym(sent)
        elif 'antonyms' in sent:
            self.antonym.Start_Antonym(sent)
        else:
            if not self.universal.check(sent):
                self.speech.speak("Invalid Response how can I help you")
        return sent
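# A minimal usage sketch, assuming Speech, PyDictionary, Universal, Meaning,
# Synonym and Antonym are importable from this project. The driver loop and the
# 'stop' keyword below are illustrative assumptions, not part of the original code.
if __name__ == '__main__':
    bot = Bot()
    while True:
        heard = bot.speak()          # listen once, dispatch to meaning/synonym/antonym handlers
        if heard and 'stop' in heard:
            break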
def classify_translate_pmi(pTrains, pTests):
    dict = CEDict()
    syn = Synonym()
    pmi = PMI()
    trains = []
    tests = []
    for label, p in pTrains:
        words = getTranlateFeaturesPMI(p, dict, pmi)
        trains.append(CDocument(label, words))
    for label, p in pTests:
        words = getTranlateFeaturesPMI(p, dict, pmi)
        tests.append(CDocument(label, words))
    return me_classify(trains, tests)
def classify_translate_simple(pTrains, pTests):
    dict = CEDict()
    syn = Synonym()
    # lm = LanguageModel()
    trains = []
    tests = []
    for label, p in pTrains:
        words = getTranlateFeatures(p, dict)
        trains.append(CDocument(label, words))
    for label, p in pTests:
        words = getTranlateFeatures(p, dict)
        tests.append(CDocument(label, words))
    return me_classify(trains, tests)
def blp_translate_syn(pTrains, pTests):
    dict = CEDict()
    syn = Synonym()
    trains = []
    tests = []
    for label, p in pTrains:
        words = getTranlateFeaturesBySyn(p, dict, syn)
        trains.append(CDocument(label, words))
    for label, p in pTests:
        words = getTranlateFeaturesBySyn(p, dict, syn)
        tests.append(CDocument(label, words))
    blp = BLP(trains + tests)
    blp.LP_Classify(trains, tests)
def blp_translate_lm(pTrains, pTests):
    dict = CEDict()
    syn = Synonym()
    lm = LanguageModel()
    trains = []
    tests = []
    for label, p in pTrains:
        words = getTranslateFeaturesByLM(p, dict, lm)
        trains.append(CDocument(label, words))
    for label, p in pTests:
        words = getTranslateFeaturesByLM(p, dict, lm)
        tests.append(CDocument(label, words))
    blp = BLP(trains + tests)
    blp.LP_Classify(trains, tests)
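# A minimal usage sketch for the four classifiers above, assuming each labeled
# document is a plain Chinese string and that CEDict, Synonym, PMI, BLP and the
# feature helpers are importable from this module. The labels and sentences here
# are placeholders for illustration only.
if __name__ == '__main__':
    pTrains = [('pos', '这部电影非常好看'), ('neg', '剧情乏味,不值得一看')]
    pTests = [('pos', '演员的表演很出色')]
    # each variant translates Chinese words into English features before classifying
    classify_translate_simple(pTrains, pTests)
    classify_translate_pmi(pTrains, pTests)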
import tensorflow as tf

from synonym import Synonym

##################
# mutable params #
##################
top = 15
conf1 = 'configure.json'
conf2 = 'configure_1.json'
cilin = '../data/cilin.txt'

# the script
g1 = tf.Graph()
g2 = tf.Graph()
with g1.as_default():
    syn1 = Synonym.load(conf1)
with g2.as_default():
    syn2 = Synonym.load(conf2)

with open(cilin, 'r', encoding='utf8') as f, open('output.txt', 'w', encoding='utf8') as g:
    for line in f:
        line = line.strip()
        words = line.split()[1:]
        tag = line.split()[0]
        more_syns = set(words)
        for word in words:
            with g1.as_default():
                syns1 = syn1.generate_synonyms(word, top)
            with g2.as_default():
                syns2 = syn2.generate_synonyms(word, top)
def suggestCandidates(self):
    graph = self.graph
    ngrams = self.ngrams
    thesaurus = self.thesaurus
    # levenshtein config
    levenshteinWithin = self.levenshtein_within_score
    levenshteinAbove = self.levenshtein_above_score
    # hybrid jaccard config
    hybridJaccardAllowExact = self.hybridjaccard_allowexact_enable
    for q, d in ngrams.items():
        keyword = q
        d["candidates"] = []
        # SINGLETON
        if d["cardinality"] == 1:
            # singleton, direct node
            if self.direct_enable:
                for node in graph.nodes():
                    if graph.nodeMatch(node, keyword):
                        synonym = Synonym(source='direct', indicator=keyword, content=keyword, score=1.0)
                        d["candidates"].append(Candidate(referent=node, referentType='node',
                                                         candidateType='direct', synonym=synonym))
                # singleton, direct edge
                for edge in graph.edges():
                    if graph.edgeMatch(edge, keyword):
                        synonym = Synonym(source='direct', indicator=keyword, content=keyword, score=1.0)
                        d["candidates"].append(Candidate(referent=edge, referentType='edge',
                                                         candidateType='direct', synonym=synonym))
            # singleton, levenshtein node
            if self.levenshtein_enable:
                for node in graph.nodes():
                    try:
                        (closest, away) = graph.nodeEditWithin(node, keyword, levenshteinWithin,
                                                               above=levenshteinAbove)
                        synonym = Synonym(source='levenshtein', indicator=keyword, content=closest, score=away)
                        d["candidates"].append(Candidate(referent=node, referentType='node',
                                                         candidateType='levenshtein', distance=away,
                                                         synonym=synonym))
                    except TypeError:
                        pass
                # singleton, levenshtein edge
                for edge in graph.edges():
                    try:
                        (closest, away) = graph.edgeEditWithin(edge, keyword, levenshteinWithin,
                                                               above=levenshteinAbove)
                        synonym = Synonym(source='levenshtein', indicator=keyword, content=closest, score=away)
                        d["candidates"].append(Candidate(referent=edge, referentType='edge',
                                                         candidateType='levenshtein', distance=away,
                                                         synonym=synonym))
                    except TypeError:
                        pass
            # singleton, hybrid jaccard node
            if self.hybridjaccard_enable:
                for node in graph.nodes():
                    best = graph.nodeNearMatch(node, keyword, allowExact=hybridJaccardAllowExact)
                    if best:
                        synonym = Synonym(source='hybridJaccard', indicator=keyword, content=best)
                        d["candidates"].append(Candidate(referent=node, referentType='node',
                                                         candidateType='hybridJaccard', synonym=synonym))
                # singleton, hybrid jaccard edge
                for edge in graph.edges():
                    best = graph.edgeNearMatch(edge, keyword, allowExact=hybridJaccardAllowExact)
                    if best:
                        synonym = Synonym(source='hybridJaccard', indicator=keyword, content=best)
                        d["candidates"].append(Candidate(referent=edge, referentType='edge',
                                                         candidateType='hybridJaccard', synonym=synonym))
            # singleton, synonym
            if self.thesaurus:
                for synonym in thesaurus.generateSynonyms(keyword):
                    content = synonym.content
                    # singleton, synonym node
                    for node in graph.nodes():
                        if graph.nodeMatch(node, content):
                            d["candidates"].append(Candidate(referent=node, referentType='node',
                                                             candidateType=synonym.source, synonym=synonym))
                    # singleton, synonym edge
                    for edge in graph.edges():
                        if graph.edgeMatch(edge, content):
                            d["candidates"].append(Candidate(referent=edge, referentType='edge',
                                                             candidateType=synonym.source, synonym=synonym))
        # MULTIWORD
        elif d["cardinality"] >= 2:
            if self.direct_enable:
                # multiword, direct
                for node in graph.nodes():
                    if graph.nodeMatch(node, keyword):
                        synonym = Synonym(source='direct', indicator=keyword, content=keyword, score=1.0)
                        d["candidates"].append(Candidate(referent=node, referentType='node',
                                                         candidateType='direct', synonym=synonym))
                for edge in graph.edges():
                    if graph.edgeMatch(edge, keyword):
                        synonym = Synonym(source='direct', indicator=keyword, content=keyword, score=1.0)
                        d["candidates"].append(Candidate(referent=edge, referentType='edge',
                                                         candidateType='direct', synonym=synonym))
            # TODO: multiword, levenshtein (or jaro_winkler, hj) -- not implemented yet
            # multiword, synonym
            for synonym in thesaurus.generateSynonyms(keyword):
                content = synonym.content
                for node in graph.nodes():
                    if graph.nodeMatch(node, content):
                        d["candidates"].append(Candidate(referent=node, referentType='node',
                                                         candidateType=synonym.source, synonym=synonym))
                for edge in graph.edges():
                    if graph.edgeMatch(edge, content):
                        d["candidates"].append(Candidate(referent=edge, referentType='edge',
                                                         candidateType=synonym.source, synonym=synonym))
parser = argparse.ArgumentParser(
    description='Outputs a rate of plagiarism based on the number of N-tuples in the '
                'comparison file that appear in the control file, where the tuples are '
                'compared by accounting for synonyms provided in the synonym file.')
parser.add_argument('-s', '--synonym_file', help='Synonym file path', required=True)
parser.add_argument('-c', '--comparison_file', help='Comparison file path', required=True)
parser.add_argument('-o', '--control_file', help='Control file path', required=True)
parser.add_argument('-t', '--tuple_size', help='Size of comparison tuple', default=3, required=False)
args = parser.parse_args()

tuple_size = int(args.tuple_size)
dictionary = Synonym(args.synonym_file).dictionary()
control = File(args.control_file, dictionary, tuple_size)
comparison = File(args.comparison_file, dictionary, tuple_size)
print(CalculateIntersection(control.synonymized_tuples, comparison.synonymized_tuples).rate_text())
def build_synonyms(self):
    constants = Constants()
    logging.info("Building synonyms started")
    for grammar in self.grammar_rules:
        if grammar.enabled == 'N':
            continue
        # load rule into parser so rhs terminal words can be extracted from lhs nouns/verbs
        configuration_file = self._generate_path() + grammar.file
        feature_cfg = load_parser(configuration_file, trace=0)
        productions = feature_cfg.grammar().productions()
        for prod in productions:
            prod_str = str(prod)
            prod_lhs_rhs = prod_str.split('-> ')
            rhs_terminal = prod_lhs_rhs[1].replace(constants.single_quote, "")
            if not rhs_terminal.isalpha():
                continue
            # ignore configured stopwords, e.g. pg, r
            is_stopword = False
            for stopword in self.stopwords:
                if rhs_terminal == stopword.stopword:
                    is_stopword = True
            if is_stopword:
                continue
            syntactic_category = ''
            wordnet_synset_type = ''
            if prod_str[0:2] == constants.syntactic_category_jj:
                syntactic_category = constants.syntactic_category_jj
                wordnet_synset_type = 's'
            elif prod_str[0:1] == constants.syntactic_category_n:
                syntactic_category = constants.syntactic_category_n
                wordnet_synset_type = 'n'
            elif prod_str[0:1] == constants.syntactic_category_v:
                syntactic_category = constants.syntactic_category_v
                wordnet_synset_type = 'n'
            elif prod_str[0:3] == constants.syntactic_category_nns:
                syntactic_category = constants.syntactic_category_nns
                wordnet_synset_type = 'n'
            # compile list of words from rules
            if syntactic_category != '':
                self.words.add(
                    Word(syntactic_category, wordnet_synset_type, rhs_terminal, grammar.ref))
    # compile synonym set from extracted word list
    for w in self.words:
        for syn in wordnet.synsets(w.word):
            for lemma in syn.lemmas():
                if lemma.name() != w.word:
                    synonym = Synonym(w.syntactic_category, w.word, lemma.name().lower(), w.grammar_ref)
                    self.synonyms.append(synonym)
    logging.info("Building synonyms completed")
    try:
        file = open(constants.path_synonyms + constants.file_synonym_set, "w")
        file.write('grammar_ref' + ',' + 'syntactic_category' + ',' + 'rule word' + ',' + 'synonym' + '\n')
        for syn in self.synonyms:
            fileline = syn.grammar_ref + ',' + syn.syntactic_category + ',' + syn.word + ',' + \
                       syn.synonym + '\n'
            file.write(fileline)
        file.close()
    except EnvironmentError as err:
        logging.warning("Failed to write synonym set: {}".format(err))
class FeatureExtract:
    def __init__(self, word_vec_file_path, right_content):
        self.comprehend_client = boto3.client('comprehend')
        self.word_type_name_list = ['NOUN']
        self.score_dict = self.get_score_dict()
        self.right_content = right_content
        self.synonym = Synonym("./target/vec.txt", threshold_rate)

    @staticmethod
    def get_score_dict():
        # teacher-assigned scores keyed by student id
        score_dict = dict()
        score_dict['184190001'] = 4
        score_dict['184190003'] = 3
        score_dict['184190010'] = 4
        score_dict['184190020'] = 4
        score_dict['184190045'] = 4
        score_dict['184190058'] = 3
        score_dict['184190071'] = 3.5
        score_dict['184190081'] = 4
        score_dict['184190109'] = 4
        score_dict['184190151'] = 3.5
        score_dict['184190170'] = 4
        score_dict['184190177'] = 4
        score_dict['184190189'] = 4
        score_dict['184190199'] = 3.5
        score_dict['184430141'] = 3.5
        return score_dict

    def read_json_file(self, file_path):
        """Read an Amazon Transcribe result file and return its transcript text."""
        with open(file_path, "r") as f:
            new_dict = json.load(f)
        return new_dict['results']['transcripts'][0]['transcript']

    def create_word_dict(self, content):
        # group the words of `content` by part-of-speech tag using Amazon Comprehend
        result = self.comprehend_client.detect_syntax(Text=content, LanguageCode='en')
        result = result['SyntaxTokens']
        word_type_dict = dict()
        for item in result:
            tag_name = item['PartOfSpeech']['Tag']
            if tag_name not in self.word_type_name_list:
                continue
            item_set = word_type_dict.get(tag_name)
            if item_set is None:
                item_set = set()
            item_set.add(item['Text'].lower())
            word_type_dict[tag_name] = item_set
        for item in word_type_dict.items():
            print('\t', item)
        return word_type_dict

    def read_all_file(self, file):
        """Return a list of (file id, word_count, word_dis_count, word_type_dict, content) items."""
        print(self.right_content)
        word_dict = self.create_word_dict(self.right_content)
        _word_dict_list = list()
        count = 0
        for root, dirs, files in os.walk(file):
            for f in files:
                print('\n', os.path.join(root, f))
                content = self.read_json_file(os.path.join(root, f))
                if count > 10:
                    continue
                count += 1
                word_count = len(content.split(' '))
                word_dis_count = len(set(content.split(' ')))
                print('word_count {} word_dis_count {}'.format(word_count, word_dis_count))
                word_type_dict = (f.split('.')[0], word_count, word_dis_count,
                                  self.create_word_dict(content), content)
                _word_dict_list.append(word_type_dict)
        return _word_dict_list

    def get_sim_score(self, base_list, new_list):
        """Compute a similarity score between the reference word list and a new word list."""
        total_score = 0
        for j in new_list:
            if j in base_list:
                total_score += 1.0
            else:
                synonym_list = self.synonym.cal_item_sim(j)
                if synonym_list is None:
                    continue
                for syn_word in synonym_list:
                    if syn_word[0] in base_list:
                        total_score += float(syn_word[1])
                        print('word {} -> syn_word {} score: {}'.format(j, syn_word, total_score))
                        break
        return float('%.2f' % total_score)

    def run(self):
        word_dict_list = self.read_all_file('./dataset')
        count_index = 0
        base_item = self.create_word_dict(self.right_content)
        for word_type_name in self.word_type_name_list:
            print('-------------- {} ---------------- '.format(word_type_name))
            tmp_item = sorted(list(base_item[word_type_name]))
            score_dict = self.get_score_dict()
            for item in word_dict_list:
                words = sorted(list(item[3][word_type_name]))
                sim_score = self.get_sim_score(tmp_item, words)
                print('学生编号: {}\t 得分: {}\t 单词个数:{} 不重复单词个数:{} \t相似度{}'.format(
                    item[0], score_dict[item[0]], item[1], item[2], sim_score))
                count_index += 1
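# A minimal usage sketch for the class above, assuming AWS credentials for the
# Comprehend client, the word-vector file at ./target/vec.txt, and transcripts under
# ./dataset. `threshold_rate` is a module-level constant in the original project;
# the value and reference transcript below are placeholders for illustration.
if __name__ == '__main__':
    threshold_rate = 0.5  # placeholder synonym-similarity threshold
    right_content = 'the quick brown fox jumps over the lazy dog'
    FeatureExtract('./target/vec.txt', right_content).run()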
def main():
    # path where the synonym thesaurus is saved
    synonym_file_path = os.path.join('output', 'synonym.pkl')
    if not os.path.exists('output'):
        os.makedirs('output')

    # load or build the synonym thesaurus
    syn = Synonym()
    if os.path.exists(synonym_file_path):
        syn.load(synonym_file_path)
        print('载入同义词库完毕。共有{}组同义词\n'.format(len(syn.word2idx)))
    else:
        syn.add_synonym(os.path.join('synonym_data', '哈工大同义词林.txt'))
        print('添加同义词表 完毕。目前共有{}组同义词'.format(len(syn.word2idx)))
        syn.save(synonym_file_path)
        print('保存同义词库完毕。\n')

    # test
    test_word = ['开心', '系统', '啥', '拜倒', '是']
    for word in test_word:
        print('{}的同义词是 {}'.format(word, syn.query_synonym(word)))
    print(syn.judge_synonym('开心', '高兴'))
    print(syn.judge_synonym('开心', '不开心'))
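# For reference, a minimal sketch of the interface the script above relies on
# (add_synonym, save, load, query_synonym, judge_synonym, word2idx). This is an
# illustrative assumption, not that project's actual Synonym implementation: it
# treats each line of the thesaurus file as one group of synonyms (a real loader
# would also strip the leading category code of each 哈工大同义词林 line).
import pickle

class SynonymSketch:
    def __init__(self):
        self.word2idx = {}   # word -> group id
        self.groups = []     # group id -> list of words in that group

    def add_synonym(self, path):
        with open(path, encoding='utf-8') as f:
            for line in f:
                words = line.split()
                if not words:
                    continue
                gid = len(self.groups)
                self.groups.append(words)
                for w in words:
                    self.word2idx.setdefault(w, gid)

    def query_synonym(self, word):
        gid = self.word2idx.get(word)
        return [] if gid is None else [w for w in self.groups[gid] if w != word]

    def judge_synonym(self, w1, w2):
        return self.word2idx.get(w1) is not None and self.word2idx.get(w1) == self.word2idx.get(w2)

    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump((self.word2idx, self.groups), f)

    def load(self, path):
        with open(path, 'rb') as f:
            self.word2idx, self.groups = pickle.load(f)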
class MyBot:
    def __init__(self, filename, keys, name, sex, age, school, city, height, weight):
        self.filename = filename  # path to the knowledge base
        # assign bot profile fields
        self.name, self.sex, self.age, self.school, self.city, self.height, self.weight = \
            name, sex, age, school, city, height, weight
        self.synonym = Synonym("synonym.csv")   # synonym thesaurus
        self.controller = APIcontroller(keys)   # API controller
        self.help = '你可以用这些小工具哦:\n' \
                    '1.@百科 查询名词\n' \
                    '2.@天气 地名\n' \
                    '3.@日期\n' \
                    '4.@笑话\n' \
                    '5.@新闻头条\n' \
                    '6.@微信精选\n' \
                    '7.@邮编查询 邮编\n' \
                    '8.@繁简火星文切换 句子\n' \
                    '9.@新华字典 字\n' \
                    '10.@成语词典 成语\n' \
                    '11.@QQ号码测吉凶 QQ号\n' \
                    '12.help\n' \
                    '要按格式来哦,不然我会当做闲聊的啦'
        # bot profile Q&A
        self.info = None
        self.update_bot_info()
        self.not_ans = ['我好像没法理解你在说什么哦', '找不到答案哦']
        # auto-reply data table
        if not os.path.exists(filename):
            with open(filename, 'w', encoding='UTF-8'):
                pass
        with open(filename, 'r', encoding='UTF-8') as file:
            self.database = list(csv.reader(file))
        print('我醒了!\n' + self.help)

    def update_bot_info(self):
        # Q&A pairs about the bot itself
        self.info = [["你的名字是什么", "我是" + self.name],
                     ["你叫什么", "我是" + self.name],
                     ["你是什么人", "我是" + self.name],
                     ["你的性别是什么", "我是" + self.sex + "的"],
                     ["你是男的女的", "我是" + self.sex + "的"],
                     ["你的年龄多大", "我" + self.age + "岁"],
                     ["你多大", "我" + self.age + "岁"],
                     ["你贵庚", "我" + self.age + "岁"],
                     ["你几岁了", "我" + self.age + "岁"],
                     ["你多老了", "我" + self.age + "岁"],
                     ["你是什么学校的", "我是" + self.school + "的"],
                     ["你哪个学校的", "我是" + self.school + "的"],
                     ["你在哪个城市", "我在" + self.city],
                     ["你在哪", "我在" + self.city],
                     ["你身高多少", "我身高" + self.height + "cm"],
                     ["你多高", "我身高" + self.height + "cm"],
                     ["你多重", "我体重" + self.weight + "kg"],
                     ["你体重多少", "我体重" + self.weight + "kg"]]

    # handle user input
    def ask(self, sentence):
        sentence = sentence.strip()
        ans = self.find(sentence)
        if ans != 0 and ans != 1:
            return ans
        return self.not_ans[ans]

    # look up an answer
    def find(self, sentence):
        if sentence == '':
            return '你怎么不说话呢'
        if sentence == 'help':
            return self.help
        if sentence[0] == '@':
            splits = sentence.split()
            if len(splits) > 1:
                res = self.controller.control(splits[0][1:], splits[1])
            else:
                res = self.controller.control(splits[0][1:], '')
            if res is False:
                return 1
            else:
                return res
        res = self.compare(sentence)
        if res is not False:
            return res
        search_names = [sentence]
        sentence = sentence.replace('啊', '').replace('哦', '').replace('嗯', '').replace('吧', '').replace('你', '').\
            replace('我', '').replace('他', '').replace('她', '').replace('它', '').replace('是', '').replace('不', '')
        search_names = search_names + self.cut(sentence)
        # print(search_names)
        for word in search_names:
            text = self.controller.search(word)
            if text is not False:
                return '我猜你想问:' + text
        return 0

    def compare(self, sentence):
        start = time.time()
        words = self.cut(sentence)
        # print(words)
        for word in [sentence] + words:
            sentences = self.synonym.replace(word, sentence)
            # print(sentences)
            for sen in sentences:
                # print(sen)
                for item in self.info + self.database:
                    # print(item)
                    # print(Similarity.get_cos_similarity(sen, item[0]))
                    if time.time() - start > 30:
                        return False
                    if Similarity.get_cos_similarity(sen, item[0]) > 0.65:
                        return item[1]
        return False

    # word segmentation
    @staticmethod
    def cut(sentence):
        words = [i for i in jieba.cut(sentence, cut_all=True) if i != '']  # segmentation result
        return words

    # learn from a QQ chat-history export
    def learn(self, filename):
        knowledge = []
        with open(filename, 'r', encoding='utf-8') as file:
            line = ''
            while True:
                lastline = line
                line = file.readline()
                if not line:
                    break
                if re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', line) is not None:
                    text = self.__get_sentence(file)
                    if text is False:
                        continue
                    if re.sub(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', '', line) != \
                            re.sub(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', '', lastline):
                        knowledge.append([text])
                    else:
                        knowledge.append(knowledge.pop() + [text])
        print(knowledge)
        """
        self.add_all(knowledge)
        self.update_to_file()
        """
        return True

    def __get_sentence(self, file):
        sentence = ''
        while True:
            line = file.readline().strip()
            if not line:
                break
            line = line.replace('[图片]', '').replace('[表情]', '').replace(',', ',')
            if line == '':
                break
            sentence = sentence + line
        if sentence != '':
            print(sentence)
            return sentence
        return False

    def add_all(self, knowledge):
        for i in range(len(knowledge) - 1):
            for q in knowledge[i]:
                for a in knowledge[i + 1]:
                    self.add_one([q, a])
        return True

    def add_one(self, new):
        self.database.append(new)
        return True

    def delete(self, index):
        try:
            return self.database.pop(index)
        except IndexError:
            return False

    def update(self, index, new):
        if index >= len(self.database):
            return False
        self.database[index] = new
        return True

    def update_to_file(self):
        if self.database is not None:
            with open(self.filename, 'w', newline='', encoding='UTF-8') as file:
                writer = csv.writer(file)
                for row in self.database:
                    writer.writerow(row)
            return True
        return False
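# A minimal usage sketch for MyBot, assuming the surrounding project supplies
# APIcontroller, Similarity and a synonym.csv file; every argument value below is
# a placeholder, including the API key list.
if __name__ == '__main__':
    bot = MyBot('database.csv', ['your-api-key'], '小明', '男', '18',
                '某大学', '北京', '175', '60')
    print(bot.ask('你叫什么'))   # answered from the built-in profile Q&A
    print(bot.ask('@日期'))      # dispatched to the API controller
    print(bot.ask('help'))       # prints the tool list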