Esempio n. 1
0
    def __init__(self,
                 database_path,
                 language_path,
                 json_output_path=None,
                 thesaurus_path=None,
                 stopwords_path=None,
                 color=False):
        """Load the database and language configuration and build the parser.

        Args:
            database_path: passed to ``Database.load``.
            language_path: passed to ``LangConfig.load``.
            json_output_path: stored unchanged on the instance.
            thesaurus_path: when truthy, a ``Thesaurus`` is loaded from it and
                attached to the database before the database itself is loaded.
            stopwords_path: when truthy, a ``StopwordFilter`` is loaded from
                it; otherwise ``self.stopwordsFilter`` is ``None``.
            color: ``without_color()`` is called when this equals ``False``.
        """
        # NOTE(review): the equality test is kept as-is; `not color` would
        # additionally trigger for None and other falsy values.
        if color == False:
            without_color()

        database = Database()

        # The thesaurus has to be attached before the database content is read.
        if thesaurus_path:
            synonyms = Thesaurus()
            synonyms.load(thesaurus_path)
            database.set_thesaurus(synonyms)

        # Stopword filtering is optional; None means "no filter configured".
        self.stopwordsFilter = StopwordFilter() if stopwords_path else None
        if self.stopwordsFilter is not None:
            self.stopwordsFilter.load(stopwords_path)

        database.load(database_path)

        language_config = LangConfig()
        language_config.load(language_path)

        self.parser = Parser(database, language_config)
        self.json_output_path = json_output_path
Esempio n. 2
0
def make_thesaurus():
    """Build the thesaurus' Jaccard word-pair table and print how long it took.

    Progress messages and the elapsed time (the difference of two ``timer()``
    readings) are printed; nothing is returned.
    """
    start = timer()
    print("setting up thesaurus")
    # The instance is only needed for this single build call.
    Thesaurus.Thesaurus().build_jaccard_word_pair_table()
    print("finished setting up thesaurus")
    end = timer()
    print(f"{end - start} seconds")
Esempio n. 3
0
def make_doc_term():
    """Build the thesaurus' document-term table and print how long it took.

    Progress messages and the elapsed time (the difference of two ``timer()``
    readings) are printed; nothing is returned.
    """
    start = timer()
    print("setting up doc_term_table")
    # The instance is only needed for this single build call.
    Thesaurus.Thesaurus().build_doc_term_table()
    print("finished setting up doc_term_table")
    end = timer()
    print(f"{end - start} seconds")
Esempio n. 4
0
    def __init__(
        self,
        textIn,
        thesaurus=None):
        """Store the input text and the thesaurus used to process it.

        Args:
            textIn: the text to work on; stored as ``self.textIn``.
            thesaurus: thesaurus object to use. When omitted (``None``) the
                default thesaurus is loaded from
                ``./thesauruses/thesaurusA.pickle``.
        """
        # A default written in the signature is evaluated once, when the
        # function is defined: the pickle would be read at import time and a
        # single instance shared by every object. Build it on demand instead.
        if thesaurus is None:
            thesaurus = Thesaurus(filename="./thesauruses/thesaurusA.pickle")

        self.textOut = None
        self.thesaurus = thesaurus
        self.punctuation = ", . ? ! : ; -".split(" ")  # Add more ???

        self.textIn = textIn
Esempio n. 5
0
 def set_method_type(self, method_type):
     """Select the method object that matches a one-character type code.

     '1' -> LocalMethod, '2' -> Thesaurus, '3' -> Wordnet,
     '4' -> MySpellCheker. Each is constructed with this object as its only
     argument and stored in ``self._method_class``. Any other value leaves
     ``self._method_class`` untouched.
     """
     if method_type == '1':
         chosen = LocalMethod(self)
     elif method_type == '2':
         chosen = Thesaurus(self)
     elif method_type == '3':
         chosen = Wordnet(self)
     elif method_type == '4':
         chosen = MySpellCheker(self)
     else:
         # Unknown code: keep whatever method is currently set.
         return
     self._method_class = chosen
Esempio n. 6
0
class Article:
    """An HTML file on disk reduced to plain text and a list of noun tokens.

    Attributes set by the constructor:
        path: the path the article was read from.
        contents: tag-free, preprocessed text of the file.
        tokens: surfaces of the tokens whose part of speech matches the noun
            filter applied in ``__init__``.
    """

    # Encodings tried, in this order, when decoding a file.
    encodings = ["utf-8", "cp932", "euc-jp", "iso-2022-jp", "latin_1"]

    # One tokenizer shared by all articles; built when the class body runs.
    tokenizer = Thesaurus('thesaurus.csv')

    def __init__(self, path):
        """Read ``path``, strip its markup and tokenize the remaining text."""
        print(path)
        self.path = path
        self.contents = self.preprocess(self.get_contents(path))
        # self.contents = self.preprocess(self.get_title(path))
        self.tokens = [
            token.surface for token in self.tokenizer.tokenize(self.contents)
            if re.match("カスタム名詞|名詞,(固有|一般|サ変)", token.part_of_speech)
        ]

    def get_contents(self, path):
        """Return the text of the HTML file at ``path`` with all tags removed.

        The file is decoded with the first entry of ``encodings`` that does
        not raise ``UnicodeDecodeError``. Only the part after the first
        ``<body>``/``<frame>`` tag is kept (the whole file, with a printed
        notice, if no such tag exists). script/style/select/noscript elements
        are dropped together with their content, then every remaining tag.

        Returns ``None`` (after printing the collected decode errors) when no
        encoding succeeds.
        """
        failures = []
        for encoding in self.encodings:
            try:
                # The context manager closes the handle even if decoding fails.
                with codecs.open(path, 'r', encoding) as stream:
                    markup = stream.read()
            except UnicodeDecodeError as error:
                failures.append(error)
                continue

            # One capture group + maxsplit=1 yields [head, tag name, body].
            parts = re.split(r"(?i)<(body|frame)[^>]*>", markup, maxsplit=1)
            if len(parts) == 3:
                body = parts[2]
            else:
                print('Cannot split ' + path)
                body = markup
            without_blocks = re.sub(
                r"(?is)<(script|style|select|noscript)[^>]*>.*?</\1\s*>",
                "", body)
            return re.sub(r"<[^>]+?>", "", without_blocks)

        print('Cannot detect encoding of ' + path)
        print(failures)
        return None

    def get_title(self, path):
        """Return the last '/'-separated component of ``path``."""
        return re.split(r'/', path)[-1]

    def preprocess(self, text):
        """Replace ``&...;`` entities with a blank and widen half-width characters.

        Digits are excluded from the widening (``digit=False``).
        """
        text = re.sub(r"&[^;]+;", " ", text)
        text = mojimoji.han_to_zen(text, digit=False)
        # text = re.sub('(\s| |#)+', " ", text)
        return text

    def dense(self, dictionary):
        """Return this article's bag-of-words as a dense vector.

        Only tokens that occur among ``dictionary.values()`` are counted; the
        vector has ``len(dictionary)`` entries.
        NOTE(review): ``dictionary`` is presumably a gensim Dictionary — confirm.
        """
        # Build the vocabulary set once so each membership test is O(1).
        vocabulary = set(dictionary.values())
        known_tokens = [token for token in self.tokens if token in vocabulary]
        bow = dictionary.doc2bow(known_tokens)
        return matutils.corpus2dense([bow], len(dictionary)).T[0]
Esempio n. 7
0
class MyApp():
	"""Tkinter window for browsing and editing a thesaurus.

	The left listbox shows the descriptors (the keys of ``self.t1.entries``);
	the right listbox shows the terms of one descriptor, one
	"<relation> <term>" string per row. The buttons beside each listbox add,
	edit and delete rows; the "Datei" menu offers new, connect, import,
	export and close.
	"""

	def __init__(self, parent, thes=""):
		"""Initialize the GUI with all visible elements and menus.

		parent -- the Tk window that hosts the widgets
		thes   -- thesaurus to edit; the default "" creates an empty
		          thesaurus named "Neuer Thesaurus"
		"""

		# self.MyParent of MyApp
		self.MyParent = parent

		# Use the thesaurus given as an argument or create an empty one.
		# Compare by value: identity ("is") against a string literal is not
		# guaranteed to be true for an equal string.
		if thes == "":
			self.t1 = Thesaurus("Neuer Thesaurus")
		else:
			self.t1 = thes
		self.MyParent.title("uberthesaurus - %s" % self.t1.name)
		# scrollbars for the listboxes
		self.scrollbar1 = Scrollbar(self.MyParent, orient=VERTICAL)
		self.scrollbar2 = Scrollbar(self.MyParent, orient=VERTICAL)
		# 2 listboxes for descriptors and terms
		self.deslistbox = Listbox(self.MyParent, yscrollcommand=self.scrollbar1.set, exportselection=0)
		self.termlistbox = Listbox(self.MyParent, yscrollcommand=self.scrollbar2.set, exportselection=0)
		#self.termlistbox.bind("<<Double-Button-1>>", lambda event:self.deslistbox.select_set())
		# selecting a descriptor refreshes the term list
		self.deslistbox.bind("<<ListboxSelect>>", lambda event: self.update_tlist())

		# a frame for changing elements
		self.myContainer1 = Frame(self.MyParent)
		# connect the scrollbars to the listboxes
		self.scrollbar1.config(command=self.deslistbox.yview)
		self.scrollbar2.config(command=self.termlistbox.yview)
		# buttons for interaction with descriptors
		self.add1_button = Button(self.MyParent, text='Hinzufuegen', command=self.add_des, width=10)
		self.edit1_button = Button(self.MyParent, text='Bearbeiten', command=self.edit_des, width=10)
		self.del1_button = Button(self.MyParent, text="Loeschen", command=self.del_des, width=10)
		# buttons for interaction with terms
		self.add2_button = Button(self.MyParent, text='Hinzufuegen', command=self.add_term, width=10)
		self.edit2_button = Button(self.MyParent, text='Bearbeiten', command=self.edit_term, width=10)
		self.del2_button = Button(self.MyParent, text="Loeschen", command=self.del_term, width=10)
		# configure the spacing
		self.MyParent.columnconfigure(1, weight=0)
		self.MyParent.columnconfigure(1, pad=0)
		self.MyParent.columnconfigure(2, pad=7)
		self.MyParent.columnconfigure(3, pad=7)
		self.MyParent.rowconfigure(1, weight=0)
		self.MyParent.rowconfigure(2, weight=0)
		self.MyParent.rowconfigure(3, weight=1)
		# place all GUI elements
		self.add1_button.grid(row=1, column=0, pady=2, sticky=NW)
		self.edit1_button.grid(row=2, column=0, pady=2, sticky=NW)
		self.del1_button.grid(row=3, column=0, pady=2, sticky=NW)

		self.deslistbox.grid(row=1, column=1, rowspan=3, pady=5, sticky=NS)
		self.scrollbar1.grid(row=1, column=2, rowspan=3, pady=5, sticky=NS)
		self.termlistbox.grid(row=1, column=3, rowspan=3, pady=5, sticky=NS)
		self.scrollbar2.grid(row=1, column=4, rowspan=3, pady=5, sticky=NS)

		self.add2_button.grid(row=1, column=5, pady=2, sticky=NW)
		self.edit2_button.grid(row=2, column=5, pady=2, sticky=NW)
		self.del2_button.grid(row=3, column=5, pady=2, sticky=NW)
		# self.myContainer1.grid()

		# Menu
		self.menu = Menu(self.MyParent)
		self.MyParent.config(menu=self.menu)
		self.filemenu = Menu(self.menu)
		self.menu.add_cascade(label="Datei", menu=self.filemenu)
		# Main Menu
		self.filemenu.add_command(label="Neu", command=self.new_thes)
		self.filemenu.add_command(label="Verbinden", command=self.t1.connect)
		self.filemenu.add_command(label="Import", command=self.importdatei)
		self.filemenu.add_command(label="Export", command=self.export)
		self.filemenu.add_command(label="Schliessen", command=self.exit_prog)


	def _selected_descriptor(self):
		"""Return the descriptor selected in the descriptor listbox, or None."""

		selection = self.deslistbox.curselection()
		if not selection:
			return None
		return self.deslistbox.get(selection[0])


	def _selected_term_row(self):
		"""Return the row text selected in the term listbox, or None."""

		selection = self.termlistbox.curselection()
		if not selection:
			return None
		return self.termlistbox.get(selection[0])


	def _split_rel_term(self, text):
		"""Split "<relation> <term>" at the first blank.

		Returns the list [relation, term], or None when either part is
		missing. Splitting only once keeps terms that contain blanks intact.
		"""

		parts = text.split(" ", 1)
		if len(parts) < 2 or not parts[0] or not parts[1]:
			return None
		return parts


	def update_dlist(self):
		"""Refill the descriptor listbox with the sorted descriptors."""

		self.deslistbox.delete(0, END)
		for elem in sorted(self.t1.entries.keys()):
			self.deslistbox.insert(END, elem)


	def update_tlist(self):
		"""Refill the term listbox.

		Shows the terms of the selected descriptor, or of the first listbox
		row when nothing is selected; the list is emptied when the thesaurus
		has no entries.
		"""

		if self.t1.entries == {}:
			self.termlistbox.delete(0, END)
			return

		descriptor = self._selected_descriptor()
		if descriptor is None:
			descriptor = self.deslistbox.get(0)
		tlist = self.t1.entries[descriptor].get_terms()
		self.termlistbox.delete(0, END)
		for key, value in sorted(tlist.iteritems()):
			for elem in value:
				self.termlistbox.insert(END, key + " " + elem)


	def del_des(self):
		"""Delete the selected descriptor; do nothing without a selection."""

		descriptor = self._selected_descriptor()
		if descriptor is not None:
			self.t1.delete_entries(descriptor)
			self.update_dlist()
			self.update_tlist()


	def add_des(self):
		"""Ask for a new descriptor and add it to the thesaurus."""

		self.des = tkSimpleDialog.askstring("Deskriptor hinzufuegen", "Deskriptor:")
		if self.des is not None:
			self.t1.create_entries(self.des)
			self.update_dlist()
			self.update_tlist()


	def edit_des(self):
		"""Ask for a new name for the selected descriptor.

		Does nothing when no descriptor is selected or the dialog is cancelled.
		"""

		descriptor = self._selected_descriptor()
		if descriptor is None:
			# nothing to rename; asking for a name would only end in a Tcl error
			return
		self.des = tkSimpleDialog.askstring("Deskriptor bearbeiten", "Bearbeiten:")
		if self.des is not None:
			self.t1.edit_entries(descriptor, self.des)
			self.update_dlist()
			self.update_tlist()


	def del_term(self):
		"""Delete the selected term from the selected descriptor.

		Does nothing unless both a descriptor and a term row are selected.
		"""

		descriptor = self._selected_descriptor()
		row = self._selected_term_row()
		if descriptor is None or row is None:
			return
		self.term = self._split_rel_term(row)
		if self.term is None:
			return
		self.t1.entries[descriptor].remove_term(self.term[0], self.term[1])
		self.update_tlist()


	def add_term(self):
		"""Ask for "<relation> <term>" and add it to the selected descriptor.

		Does nothing when no descriptor is selected or the dialog is
		cancelled; input without a blank is rejected with a message.
		"""

		descriptor = self._selected_descriptor()
		if descriptor is None:
			return
		self.term = tkSimpleDialog.askstring("Term hinzufuegen", "Rel Term:")
		if self.term is None:
			return
		self.term = self._split_rel_term(self.term.strip())
		if self.term is None:
			print("Ungueltige Eingabe, erwartet: Rel Term")
			return
		self.t1.add(descriptor, self.term[1], self.term[0])
		self.update_dlist()
		self.update_tlist()


	def edit_term(self):
		"""Ask for a new "<relation> <term>" for the selected term row.

		The relation and the term are each updated only if they changed.
		Does nothing unless both a descriptor and a term row are selected;
		input without a blank is rejected with a message.
		"""

		descriptor = self._selected_descriptor()
		row = self._selected_term_row()
		if descriptor is None or row is None:
			return
		self.rel_term = tkSimpleDialog.askstring("Term bearbeiten", "Rel Term")
		if self.rel_term is None:
			return
		self.rel_term = self._split_rel_term(self.rel_term.strip())
		self.rel_term_old = self._split_rel_term(row)
		if self.rel_term is None or self.rel_term_old is None:
			print("Ungueltige Eingabe, erwartet: Rel Term")
			return
		entry = self.t1.entries[descriptor]
		if self.rel_term[0] != self.rel_term_old[0]:
			entry.edit_rel(str(self.rel_term_old[0]), str(self.rel_term_old[1]), str(self.rel_term[0]))
		# NOTE(review): this still passes the OLD relation even if edit_rel
		# just changed it - confirm against the entry class that this is intended.
		if self.rel_term[1] != self.rel_term_old[1]:
			entry.edit_term(str(self.rel_term_old[0]), str(self.rel_term_old[1]), str(self.rel_term[1]))
		self.update_dlist()
		self.update_tlist()


	def exit_prog(self):
		"""Exit the program by destroying the parent window."""
		self.MyParent.destroy()


	def new_thes(self):
		"""Clear all entries of the thesaurus and reset its name."""

		self.t1.entries = {}
		self.update_dlist()
		self.update_tlist()
		self.t1.name = "Neuer Thesaurus"
		self.MyParent.title("uberthesaurus - %s" % self.t1.name)


	def export(self):
		"""Ask for a target file and export the thesaurus to it.

		Prints a message when the dialog returns no file name.
		"""

		self.formats = [
			('Comma-separated values', '*.csv'),
			('JavaScript Object Notation', '*.json'),
			('Extensible Markup Language', '*.xml'),
		]
		self.filename = asksaveasfilename(filetypes=self.formats, title="Den Thesaurus exportieren", defaultextension=".xml")
		if len(self.filename) > 0:
			self.t1.export_thesaurus(self.filename)
			self.MyParent.title("uberthesaurus - %s" % self.t1.name)
		else:
			print("Keine Datei angegeben.")


	def importdatei(self):
		"""Ask for a source file and import the thesaurus from it.

		Prints a message when the dialog returns no file name.
		"""

		self.filename = askopenfilename()
		if len(self.filename) > 0:
			self.t1.import_thesaurus(self.filename)
			self.update_dlist()
			self.MyParent.title("uberthesaurus - %s" % self.t1.name)
		else:
			print("Keine Datei angegeben.")
Esempio n. 8
0
	def __init__(self, parent, thes=""):
		"""Initialize the GUI with all visible elements and menus.

		parent -- the Tk window that hosts the widgets
		thes   -- thesaurus to edit; the default "" creates an empty
		          thesaurus named "Neuer Thesaurus"
		"""

		# self.MyParent of MyApp
		self.MyParent = parent

		# Use the thesaurus given as an argument or create an empty one.
		# Compare by value: identity ("is") against a string literal is not
		# guaranteed to be true for an equal string.
		if thes == "":
			self.t1 = Thesaurus("Neuer Thesaurus")
		else:
			self.t1 = thes
		self.MyParent.title("uberthesaurus - %s" % self.t1.name)
		# scrollbars for the listboxes
		self.scrollbar1 = Scrollbar(self.MyParent, orient=VERTICAL)
		self.scrollbar2 = Scrollbar(self.MyParent, orient=VERTICAL)
		# 2 listboxes for descriptors and terms
		self.deslistbox = Listbox(self.MyParent, yscrollcommand=self.scrollbar1.set, exportselection=0)
		self.termlistbox = Listbox(self.MyParent, yscrollcommand=self.scrollbar2.set, exportselection=0)
		#self.termlistbox.bind("<<Double-Button-1>>", lambda event:self.deslistbox.select_set())
		# selecting a descriptor refreshes the term list
		self.deslistbox.bind("<<ListboxSelect>>", lambda event: self.update_tlist())

		# a frame for changing elements
		self.myContainer1 = Frame(self.MyParent)
		# connect the scrollbars to the listboxes
		self.scrollbar1.config(command=self.deslistbox.yview)
		self.scrollbar2.config(command=self.termlistbox.yview)
		# buttons for interaction with descriptors
		self.add1_button = Button(self.MyParent, text='Hinzufuegen', command=self.add_des, width=10)
		self.edit1_button = Button(self.MyParent, text='Bearbeiten', command=self.edit_des, width=10)
		self.del1_button = Button(self.MyParent, text="Loeschen", command=self.del_des, width=10)
		# buttons for interaction with terms
		self.add2_button = Button(self.MyParent, text='Hinzufuegen', command=self.add_term, width=10)
		self.edit2_button = Button(self.MyParent, text='Bearbeiten', command=self.edit_term, width=10)
		self.del2_button = Button(self.MyParent, text="Loeschen", command=self.del_term, width=10)
		# configure the spacing
		self.MyParent.columnconfigure(1, weight=0)
		self.MyParent.columnconfigure(1, pad=0)
		self.MyParent.columnconfigure(2, pad=7)
		self.MyParent.columnconfigure(3, pad=7)
		self.MyParent.rowconfigure(1, weight=0)
		self.MyParent.rowconfigure(2, weight=0)
		self.MyParent.rowconfigure(3, weight=1)
		# place all GUI elements
		self.add1_button.grid(row=1, column=0, pady=2, sticky=NW)
		self.edit1_button.grid(row=2, column=0, pady=2, sticky=NW)
		self.del1_button.grid(row=3, column=0, pady=2, sticky=NW)

		self.deslistbox.grid(row=1, column=1, rowspan=3, pady=5, sticky=NS)
		self.scrollbar1.grid(row=1, column=2, rowspan=3, pady=5, sticky=NS)
		self.termlistbox.grid(row=1, column=3, rowspan=3, pady=5, sticky=NS)
		self.scrollbar2.grid(row=1, column=4, rowspan=3, pady=5, sticky=NS)

		self.add2_button.grid(row=1, column=5, pady=2, sticky=NW)
		self.edit2_button.grid(row=2, column=5, pady=2, sticky=NW)
		self.del2_button.grid(row=3, column=5, pady=2, sticky=NW)
		# self.myContainer1.grid()

		# Menu
		self.menu = Menu(self.MyParent)
		self.MyParent.config(menu=self.menu)
		self.filemenu = Menu(self.menu)
		self.menu.add_cascade(label="Datei", menu=self.filemenu)
		# Main Menu
		self.filemenu.add_command(label="Neu", command=self.new_thes)
		self.filemenu.add_command(label="Verbinden", command=self.t1.connect)
		self.filemenu.add_command(label="Import", command=self.importdatei)
		self.filemenu.add_command(label="Export", command=self.export)
		self.filemenu.add_command(label="Schliessen", command=self.exit_prog)
Esempio n. 9
0
            textSubstitutions.append(sentenceSubstitutions)
        return textSubstitutions

    def obfuscate(self, text):
        """Return ``text`` after tokenising, synonym substitution and untokenising.

        The three private steps run in that order, each receiving the
        previous step's result.
        """
        # Resolve all three steps up front, then feed each result to the next.
        tokenise, substitute, untokenise = (
            self._tokenise, self._substituteSynonyms, self._untokenise)
        return untokenise(substitute(tokenise(text)))

    def __repr__(self):
        """Describe the obfuscator by its thesaurus file name and replace factor."""
        template = "Text obfuscator on thesaurus: '{}', with replace factor: {}"
        return template.format(self._thesaurus.filename, self._replaceFactor)


if __name__ == "__main__":
    # Demo run: obfuscate one sample sentence and print it before and after.
    # Seeding comes first; presumably this makes the synonym choice
    # repeatable between runs - TODO confirm that seed() is random.seed.
    seed(0)

    thesaurus = Thesaurus(filename="./thesauruses/thesaurusA.pickle")
    obfuscator = Obfuscator(thesaurus, replaceFactor=1)
    # Prints the obfuscator's __repr__.
    print(obfuscator)
    text = "The quick brown fox jumped over the lazy dog!"

    obfuscatedText = obfuscator.obfuscate(text)
    print(text)
    print(obfuscatedText)
Esempio n. 10
0
    ("jacoby", "N"),
]
# Word pair whose similarity is printed when no similarity cache is used.
testpair = (("kiwis", "N"), ("zealanders", "N"))


# simcache=False #whether file currently contains valid sims
k = 1000
kdisplay = 10

print(sys.argv)
# The command line decides whether the input is a byblo file.
Thesaurus.byblo = byblo
# Compression is switched on for the cosine metric only.
compress = metric == "cosine"
mythes = Thesaurus(vectorfilename, simcachefile, simcache, windows, k, adja, adjb, compress)
mythes.readvectors()
# if simcache:
#    check=True
# else:
#    for wordA in words:
#        for wordB in words:
#            mythes.outputsim(wordA,wordB,metric)


word1, word2 = testpair
if simcache == False:
    mythes.outputsim(word1, word2, metric)

mythes.allpairssims(metric)
Esempio n. 11
0
 def __init__(self, config=None):
     """Store the configuration and create the helper objects.

     ``config`` is kept on the instance and handed to both ``Parse`` and
     ``Indexer``; ``Word2Vec`` and ``Thesaurus`` are created without
     arguments.
     """
     self._config = config
     self._parser = Parse(config)
     self._indexer = Indexer(config)
     self._model = Word2Vec()
     self._model_1 = Thesaurus()
Esempio n. 12
0
from thesaurus import Thesaurus
from sys import exit

thes = Thesaurus()

print('Type HELP to see a list of commands and instructions.\n')

while True:
    query = input('Enter a word to look it up in the thesaurus:\n')

    if query.strip() == 'HELP':
        print('QUIT'.ljust(25), 'Exits the program.')
        print('REQUIRE <word list(s)>'.ljust(25), 'Shows only results from these specific word list(s).')
        print('REQUIRE ANY'.ljust(25), 'Does not show words that are not in a word list.')
        print('RESET SETTINGS'.ljust(25), 'Resets to default settings.')
        print()
        print('To add a new word list, create a txt file in the filters directory with each word on a new line. Then restart the program.')

    elif query.strip() == 'QUIT':
        exit()

    elif query.strip().lower() == 'reset settings':
        thes = Thesaurus()
        print('Settings have been reset.')

    elif query.strip() == 'require any':
        thes = Thesaurus(must_match=True)

    elif len(query.strip().split()) > 1:

        # change required word list(s)
Esempio n. 13
0
 def init_logic_translator(self, logic_thesaurus_path):
     """Initialize the logic translator.

     Builds ``self._logic_thesaurus`` from the file at
     ``logic_thesaurus_path`` and passes the same file to
     ``jieba.load_userdict``.
     """
     # presumably ["Logic"] names the attribute column(s) to read - TODO confirm
     self._logic_thesaurus = Thesaurus(logic_thesaurus_path, ["Logic"])
     jieba.load_userdict(logic_thesaurus_path)
Esempio n. 14
0
 def init_classification(self, customized_thesaurus_path):
     """Initialize the customized thesaurus.

     Builds ``self._customized_thesaurus`` from the file at
     ``customized_thesaurus_path`` and passes the same file to
     ``jieba.load_userdict``.
     """
     # presumably the list names the attribute columns to read - TODO confirm
     self._customized_thesaurus = Thesaurus(customized_thesaurus_path, ["Frequency", "Property"])
     jieba.load_userdict(customized_thesaurus_path)
Esempio n. 15
0
class TokenUtil(object):
    """Tokenizer helper built on jieba plus two optional thesauri.

    ``init_classification`` must be called before ``get_keyword`` and
    ``init_logic_translator`` before ``logic_translate``; each creates the
    thesaurus attribute that the corresponding method reads.
    """

    def __init__(self, general_thesaurus_path):
        """Initialize jieba with the general thesaurus file."""
        jieba.initialize(general_thesaurus_path)

    def init_classification(self, customized_thesaurus_path):
        """Load the customized thesaurus and pass the same file to jieba."""
        self._customized_thesaurus = Thesaurus(customized_thesaurus_path, ["Frequency", "Property"])
        jieba.load_userdict(customized_thesaurus_path)

    def get_keyword(self, content):
        """Return the customized-thesaurus words found in ``content``.

        Returns ``{'Type': 'customized', 'Token': [words...]}`` when at least
        one search token is in the customized thesaurus, otherwise
        ``{'Type': 'general', 'Token': content}`` with the unchanged input.
        """
        customized_words = [
            atoken for atoken in jieba.cut_for_search(content)
            if atoken in self._customized_thesaurus
        ]
        if customized_words:
            return {'Type': 'customized',
                    'Token': customized_words}
        return {'Type': 'general',
                'Token': content}

    def init_logic_translator(self, logic_thesaurus_path):
        """Load the logic thesaurus and pass the same file to jieba."""
        self._logic_thesaurus = Thesaurus(logic_thesaurus_path, ["Logic"])
        jieba.load_userdict(logic_thesaurus_path)

    def logic_translate(self, content):
        """Translate the content to a logic expression for Baidu Search Engine.

        Tokens found in the logic thesaurus carry its "Logic" attribute as
        their type; all other tokens are "Common". A NOT word followed by a
        common word becomes ``-(word)``; an AND / OR word between two common
        words becomes ``(a b)`` / ``(a | b)``. Logic words that cannot be
        merged are treated as common words.

        Returns ``{"Type": "general", "Content": content}`` when no logic word
        occurs or the translated text is blank, otherwise
        ``{"Type": "logic", "Content": <translated text>}``.
        """
        content = strdecode(content)
        result_list = []
        has_logic = False
        for token in jieba.tokenize(content):
            # token[0] is the token text
            if token[0] in self._logic_thesaurus:
                has_logic = True
                result_list.append({"Type": self._logic_thesaurus.get_attr(token[0])["Logic"],
                                    "Content": token[0]})
            else:
                result_list.append({"Type": "Common",
                                    "Content": token[0]})
        if not has_logic:
            return {"Type": "general",
                    "Content": content}
        translate_finish = False
        or_list = []
        not_list = []
        and_list = []
        # Every merge deletes items, so the scan restarts (break) after each
        # one; the translation is finished once a scan reaches the last item.
        while not translate_finish and len(result_list) > 0:
            for index in range(len(result_list)):
                if result_list[index]["Type"] == "NOT":
                    # NOT logic word
                    if index < len(result_list) - 1 and result_list[index + 1]["Type"] == "Common":
                        # mergeable: combine it with the following word
                        not_list.append("-(" + result_list[index + 1]["Content"] + ")")
                        del result_list[index + 1]
                        del result_list[index]
                        break
                    else:
                        # not mergeable: treat the logic word as a common word
                        result_list[index]["Type"] = "Common"
                if result_list[index]["Type"] == "AND":
                    # AND logic word
                    if 0 < index < len(result_list) - 1 and result_list[index + 1]["Type"] == "Common" and\
                        result_list[index - 1]["Type"] == "Common":
                        and_list.append("(" + result_list[index - 1]["Content"] + " " +
                                        result_list[index + 1]["Content"] + ")")
                        del result_list[index + 1]
                        del result_list[index]
                        del result_list[index - 1]
                        break
                    else:
                        # not mergeable: treat the logic word as a common word
                        result_list[index]["Type"] = "Common"
                if result_list[index]["Type"] == "OR":
                    # OR connective
                    if 0 < index < len(result_list) - 1 and result_list[index + 1]["Type"] == "Common" and\
                        result_list[index - 1]["Type"] == "Common":
                        or_list.append("(" + result_list[index - 1]["Content"] + " | " +
                                        result_list[index + 1]["Content"] + ")")
                        del result_list[index + 1]
                        del result_list[index]
                        del result_list[index - 1]
                        break
                    else:
                        # not mergeable: treat the logic word as a common word
                        result_list[index]["Type"] = "Common"
                if index >= len(result_list) - 1:
                    # all logic words have been processed
                    translate_finish = True
        result_content = " ".join([item["Content"] for item in result_list]) + " " + " ".join(and_list) + " " +\
                         " ".join(or_list) + " " + " ".join(not_list)
        # The joined text always contains the three separating blanks, so a
        # comparison with "" could never match; test the stripped text.
        if result_content.strip() == "":
            return {"Type": "general",
                    "Content": content}
        return {"Type": "logic",
                "Content": result_content}
Esempio n. 16
0
    parameters=conf.configure(sys.argv)

    inputfile="allBLESS-dependencies.json"
    inputpath=os.path.join(parameters['datadir'],inputfile)
    print inputpath
    with open(inputpath,'r') as instream:
        for line in instream:
            print line

    pairs = json.loads(inputpath)

    cluster0=[]
    cluster1=[]
    for (w1,w2,target) in pairs:
        if target==1:
            cluster1.append(w2)
        else:
            cluster0.append(w2)

    print len(cluster0), cluster0
    print len(cluster1),cluster1
    exit()


    words=["chicken","cricket","jaguar"]
    pos="N"

    mythes = Thesaurus("",parameters["simfile"],True,False,parameters["k"],1,1,False)
    mythes.readsomesims(words)
    for word in words:
        mythes.displayneighs((word,pos),100)
Esempio n. 17
0
'''
Created on Dec 4, 2012

@author: juliewe
'''

#test thesaurus class

from thesaurus import Thesaurus
from thesaurus import Entry
from thesaurus import Neighbours

filename='/Volumes/research/calps/data3/mlcl/DisCo/thesauri/exp4-11c.strings'

mythesaurus = Thesaurus("test")
#
e1 = Entry("cat/N","dog/N",0.8)
e2 = Entry("cat/N","ostrich/N",0.5)
e3 = Entry("dog/N","ostrich/N",0.6)

mythesaurus.addEntry(e1)
mythesaurus.addEntry(e2)
mythesaurus.addEntry(e3)



print "Similarity between cat and ostrich is ", mythesaurus.lookupSim("cat","ostrich")
print "Similarity between cat and bird is ", mythesaurus.lookupSim("cat","bird")

print "Number of entries:", Entry.entrycount
print "Number of neighbour sets:", Neighbours.entrycount
Esempio n. 18
0
 def __init__(self, synonym_file, input_file_1, input_file_2, tuple_size):
     """Build the thesaurus from ``synonym_file`` and store the other arguments.

     ``input_file_1``, ``input_file_2`` and ``tuple_size`` are kept unchanged
     on the instance; nothing is read from the input files here.
     """
     self.thesaurus = Thesaurus(synonym_file)
     self.input_file_1 = input_file_1
     self.input_file_2 = input_file_2
     self.tuple_size = tuple_size
class Data:
    """Static Portuguese-language data tables, held as class attributes.

    The class defines no methods; it only groups three constants:
    ``structured_answers``, ``thesaurus`` and ``words_to_be_ignored``.
    """

    # Each AnswerStructure pairs one text with a list of phrasings.
    # NOTE(review): the first argument looks like the answer and the list like
    # the questions that trigger it - confirm against AnswerStructure.
    structured_answers = [
        AnswerStructure('Ola, tudo bem?', ['Oi tudo bem', 'Oi', 'Tudo bem']),
        AnswerStructure(
            'Um ser humano adulto possui entre 4 a 6 litros de sangue.', [
                'quantos litros de sangue uma pessoa tem ?',
                'qual a quantidade de sangue de uma pessoa adulta ?'
            ]),
        AnswerStructure('São retirados 450 mililitros numa doação de sangue.',
                        ['quantos litros de sangue doação ?']),
        AnswerStructure('Celebre frase de Renè Descartes.', [
            ' De quem e a famosa frase “ Penso , logo existo ” ?',
            'famosa frase “ Penso , logo existo ” ?',
            'frase “ Penso , logo existo ” ?', 'Penso , logo existo',
            'Penso logo existo'
        ]),
        AnswerStructure('O chuveiro elétrico foi inventado no Brasil.', [
            'De onde é a invenção do chuveiro elétrico ?',
            'invenção do chuveiro elétrico ?',
            'onde foi inventado o chuveiro elétrico ?',
            'que país inventou o chuveiro elétrico ?'
        ]),
        AnswerStructure(
            'Quem inventou o chuveiro elétrico foi o brasileiro Francisco Canho.',
            [
                'Quem inventou o chuveiro elétrico ?',
                'Que pessoa inventou o chuveiro elétrico ?'
            ]),
        AnswerStructure(
            'Vaticano e Russia são o menor e o maior país do mundo, respectivamente.',
            [
                'Qual o menor e o maior país do mundo ?',
                'menor e o maior país do mundo ?', 'menor e o maior país',
                'qual menor e o maior país ?', 'país', 'países'
            ]),
        AnswerStructure('Vaticano é o menor país do mundo.', [
            'Qual o menor país do mundo ?', 'menor país do mundo',
            'menor país', 'Qual o menor país ?'
        ]),
        AnswerStructure('Russia é o maior país do mundo.', [
            'Qual o maior país do mundo ?', 'maior país do mundo ?',
            'maior país', 'Qual o maior país ?'
        ]),
        AnswerStructure('João Goulart.', [
            'Qual o nome do presidente do Brasil que ficou conhecido como Jango ?',
            'Qual o nome do presidente Jango ?', 'nome do presidente Jango ?',
            'Jango ?', 'presidente conhecido como Jango'
        ]),
        AnswerStructure(
            'A velocidade da luz é de 299 792 458 metros por segundo.',
            ['velocidade da luz em m/s', 'qual a velocidade da luz em m/s ?']),
        AnswerStructure('A velocidade da luz é de 300.000 Km/s.', [
            'velocidade da luz em km/s', 'qual a velocidade da luz',
            'velocidade da luz', 'qual a velocidade da luz em km/s ?'
        ]),
        AnswerStructure('42.', [
            '6x9', '6 x 9',
            'qual a resposta para a vida , o universo e tudo mais ?',
            'vida universo e tudo mais', 'resposta para a vida',
            'resposta para o universo'
        ]),
    ]

    # Groups of words/phrases wrapped in Synonym objects; the spelling
    # variants with and without accents are listed explicitly.
    thesaurus = Thesaurus([
        Synonym(['voce', 'vc', 'oce', 'ce', 'vs', 'vossa senhoria']),
        Synonym([
            'tudo bem', 'Tudo bem com voce', 'Tudo bom', 'Sussa', 'De boas',
            'De boa', 'Suave', 'Beleza', 'Blz', 'Firmeza', 'Fmz', 'Como vai',
            'Como está', 'Na boa'
        ]),
        Synonym(['Ola', 'Oi', 'E ai', 'Dae']),
        Synonym(['sair', 'fim', 'terminar', 'tchau', 'xau', 'vlw flw', 'flw']),
        Synonym(['maior', 'mais grande', 'gigante', 'grande']),
        Synonym(['menor', 'mais pequeno', 'pequeno']),
        Synonym(['doacao', 'doar', 'doação', 'doaçao', 'doacão']),
        Synonym([
            'de onde é', 'de onde e', 'onde', 'em que país', 'em que pais',
            'em qual pais', 'em qual país', 'em que lugar'
        ]),
        Synonym(['a gente', 'nós']),
        Synonym(['pais', 'país']),
        Synonym(['paises', 'países']),
        Synonym([
            'invencao', 'invenção', 'invencão', 'invençao', 'criação',
            'criacao'
        ]),
        Synonym(['inventor', 'criador']),
        Synonym(['é', 'e']),
        Synonym(['qual a', 'qual é a', 'qual e a']),
        Synonym(['tem', 'possui']),
        Synonym(['ter', 'possuir']),
        Synonym([
            'quantidade', 'quantos', 'qual a quantidade', 'qual quantidade',
            'quanto'
        ]),
        Synonym(['elétrico', 'eletrico']),
        Synonym(['pessoa', 'ser humano adulto', 'pessoa adulta', 'adulto']),
        Synonym([
            'que pessoa', 'qual pessoa', 'quem', 'qual foi a pessoa',
            'quem foi que'
        ]),
        Synonym([
            'metros por segundo', 'm/s', 'metros/s', 'm/sec', 'metros/sec',
            'mt/s'
        ]),
        Synonym(['quilometros por segundo', 'km/s', 'kilometros por segundo']),
    ])

    # Immutable set of short words and punctuation marks; judging by the
    # name they are skipped by the consumer of this class - TODO confirm.
    words_to_be_ignored = frozenset([
        'da', 'de', 'do', 'a', 'o', 'um', 'uma', 'é', 'para', 'pra', 'com',
        'sem', '?', '.', ','
    ])