def read_file(self, filename):
    print("Reading lines...")

    # Read the file and split into lines (use a context manager so the
    # file handle is closed)
    with open('data/%s.txt' % filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[seq_utils.normalizeString(s) for s in l.split('\t')][:2]
             for l in lines]
    pairs = self.filterPairs(pairs)

    source = Language()
    target = Language()
    for pair in pairs:
        if self.is_valid_pair(pair[0], pair[1]):
            source.addSentence(pair[0])
            target.addSentence(pair[1])

    print(f'Source language counted words: {source.n_words}')
    print(f'Target language counted words: {target.n_words}')
    return source, target, pairs
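# Usage sketch (hedged: the owner class of read_file is not shown above, so
# `DataReader` is a hypothetical name, and 'eng-fra' is an illustrative
# corpus name). Each line of data/<filename>.txt is assumed to hold one
# tab-separated sentence pair, e.g. "go .\tva !".
#
#   reader = DataReader()
#   source, target, pairs = reader.read_file('eng-fra')
#   print(len(pairs), source.n_words, target.n_words)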
def setListItems(self):
    self.DefaultLanguage = "English"
    self.CurrentLanguage = xbmc.getLanguage()
    self.FolderLanguage = self.settings["LanguagePath"]
    self.DefaultFolderLanguage = "special://xbmc/language/"
    if not os.path.exists(xbmc.translatePath(self.FolderLanguage)):
        print "Language folder does not exist! '%s'" % self.FolderLanguage
        self.FolderLanguage = self.DefaultFolderLanguage
    if not self.FolderLanguage.startswith("special"):
        for folder in ["profile", "home", "xbmc"]:
            special = "special://%s/" % folder
            self.FolderLanguage = self.FolderLanguage.replace(
                xbmc.translatePath(special), special)
    if (xbmc.translatePath("special://skin/")
            in xbmc.translatePath(self.FolderLanguage)):
        self.FolderLanguage = "special://skin/language/"
    self.FolderLanguage = self.FolderLanguage.replace("\\", "/").rstrip("/")
    self.setContainerProperties()
    # get languages source
    self.language = Language(self)
    self.listitems = self.language.listitems
def main():
    languages = Language()
    requirements = Requirement(languages.language_dict)
    while True:
        language_name = languages.set_language_name()
        languages.write_dict_language()
        while True:
            requirements.set_requirement_name(language_name)
            requirements.write_requirements_name()
            print('Do you want to exit? Y/N')
            c = input()
            if c == 'Y' or c == 'y':
                break
            elif c == 'N' or c == 'n':
                continue
            else:
                print('Something went wrong.')
        print('Do you want to exit? Y/N')
        c = input()
        if c == 'Y' or c == 'y':
            break
        elif c == 'N' or c == 'n':
            continue
        else:
            print('Something went wrong.')
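# A minimal refactoring sketch (assumes only the behavior shown above): the
# duplicated exit prompt can be factored into one helper that loops until it
# gets a valid answer, so neither loop needs its own else branch.

def wants_to_exit():
    """Ask until the user answers Y or N; return True to exit."""
    while True:
        answer = input('Do you want to exit? Y/N ').strip().lower()
        if answer == 'y':
            return True
        if answer == 'n':
            return False
        print('Something went wrong.')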
def NV_extract(self):
    NList = []
    VList = []
    for i in range(1, self.s.nrows):
        print i
        noenc = self.delete_unnecc(i)
        lan = Language(noenc)
        if len(lan.str) > 4000:
            continue
        word = lan.getMorpheme()
        tmpNoun = u""
        NN = 0
        for j, line in enumerate(word):
            if line[1] == u"動詞":  # verb
                if line[7] == u"する":  # suru-verb: prepend the preceding noun
                    VList.append(word[j - 1][0] + line[7])
                else:
                    VList.append(line[7])
            if line[1] == u"名詞" and j != len(word) - 1:  # noun
                if word[j + 1][1] == u"名詞":
                    # Next token is also a noun: build a compound noun
                    tmpNoun += line[0]
                    NN += 1
                    if NN > 3:
                        tmpNoun = u""
                        NN = 0
                        continue
                elif word[j + 1][7] != u"する":
                    NList.append(tmpNoun + line[0])
                    tmpNoun = u""
                    NN = 0
    return NList, VList
def test_isVerbSubjunctive(self):
    self.h = Verb(Language())
    self.assertEqual(self.h.isVerbSubjunctive("xoxo"), False)
    self.assertEqual(self.h.isVerbSubjunctive("xoxox"), False)
    self.assertEqual(self.h.isVerbSubjunctive("oxooxo"), True)
    self.assertEqual(self.h.isVerbSubjunctive("xxoxoxs"), False)
    self.assertEqual(self.h.isVerbSubjunctive("oxoxoxo"), True)
def __init__(self): print("Messenger Running...") self.debug = True self.username = "******" self.password = "******" # Poll frequency (in seconds) self.timer = 8 self.language = Language()
def __init__(self):
    self.language = Language()
    self.box = Grid2D(50, 10)
    self.dictionary = Random_Word_Generator()
    self.dictionary.load_dictionary()
    self.game = Game(self.box, self.language)
    self.game.choose_generator(self.dictionary)
    self.music = Music()
    self.game.connect_music(self.music)
def __init__(self, world, color, capital):
    self.world = world
    self.color = color
    self.capital = capital
    self.settlements = [capital]
    self.language = Language()
    self.name = self.language.genName()
    self.leader = self.genLeader()
def __init__(self, filename, erease=False, language=None):
    if filename is None:
        logging.warning("Invalid dictionary save file")
        sys.exit()
    self._filename = filename
    logging.info("loading dictionary")
    self._dico = AlphabeticTree(filename, erease)
    logging.info("dictionary loaded")
    # Avoid a stateful default argument: a default of Language() would be
    # evaluated once at definition time and shared across all instances.
    self._Language = language if language is not None else Language()
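# Why the None-sentinel above matters -- a minimal, self-contained
# demonstration of the default-argument pitfall (all names here are
# illustrative, not part of the project):

class _DemoCounter:
    def __init__(self):
        self.n = 0

def bad(counter=_DemoCounter()):   # one shared object for every call
    counter.n += 1
    return counter.n

def good(counter=None):            # a fresh object per call
    counter = counter if counter is not None else _DemoCounter()
    counter.n += 1
    return counter.n

assert bad() == 1 and bad() == 2     # state leaks between calls
assert good() == 1 and good() == 1   # state does not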
def descr_from_content(content, var_name, language=Language('python')):
    """
    Returns the description of the vector var_name with respect to the content

    Parameters
    ----------
    content : str
        Content of the string ("xyz", "xy", "z", ...) (no rpY)
    var_name : str
        Name of the vector you want to describe
    language : Language.Language
        Language of the generated docstring (used for vector subscription)
        Defaults to Language('python')

    Returns
    -------
    description : str
        Description of the vector

    Examples
    --------
    >>> my_description = descr_from_content("xyz", "Xd")
    >>> print(my_description)
    This is a (3 x 1) numpy.ndarray where :
     - Xd[0] is the x coordinate of Xd,
     - Xd[1] is the y coordinate of Xd,
     - Xd[2] is the z coordinate of Xd

    >>> from Language import Language
    >>> my_description = descr_from_content("xz", "Xd", Language('Julia'))
    >>> print(my_description)
    This is a (2 x 1) Matrix where :
     - Xd[1] is the x coordinate of Xd,
     - Xd[2] is the z coordinate of Xd

    >>> from Language import Language
    >>> my_description = descr_from_content("z", "Xd", Language('MATLAB'))
    >>> print(my_description)
    This is a (1 x 1) double where :
     - Xd(1) is the z coordinate of Xd
    """
    descr = f"This is a ({len(content)} x 1) {language.matrix_type} where :\n"
    for i, c in enumerate(content):
        qp = language.slice_mat(var_name, i, None, None, None)
        comma = ",\n" if i < len(content) - 1 else ""
        descr += f" - {qp} is the {c} coordinate of {var_name}{comma}"
    return descr
def extract_terms(self, case_df):
    Noun_comp = u""
    wakachi = u""  # space-delimited token string (wakachi-gaki)
    preR_id = 1
    terms = []
    documents = []
    for (Report_id, frame) in zip(
            case_df[u"報告書_id"],
            case_df.loc[:, [u"主体", u"起点", u"対象", u"状況",
                            u"着点", u"手段", u"関係", u"動詞"]].values):
        # if Report_id > 100: break
        if preR_id != Report_id:
            documents.append(wakachi)
            print Report_id
            # print wakachi
            wakachi = u""
        # The verb slot: strip the suru-suffix if present
        if frame[7][-2:] != u"する":
            wakachi += frame[7] + u" "
            if frame[7] not in terms:
                terms.append(frame[7])
        else:
            wakachi += frame[7][:-2] + u" "
            if frame[7][:-2] not in terms:
                terms.append(frame[7][:-2])
        for i in range(0, 7):
            if frame[i] == u' ':
                continue
            Lan = Language(frame[i])
            outList = Lan.getMorpheme()
            Mor_1 = [outList[k][1] for k in range(len(outList))]
            # if (u"接続詞" in Mor_1) | (u"記号" in Mor_1):
            #     continue
            for mi, Mor in enumerate(outList):
                if Mor_1[mi] == u"名詞" and Mor[2] != u"形容動詞語幹":
                    # Accumulate consecutive nouns into one compound noun
                    Noun_comp += Mor[0]
                    if mi < len(Mor_1) - 1:
                        if Mor_1[mi + 1] != u"名詞":
                            wakachi += Noun_comp + u" "
                            if Noun_comp not in terms:
                                terms.append(Noun_comp)
                            Noun_comp = u""
                    else:
                        wakachi += Noun_comp + u" "
                        if Noun_comp not in terms:
                            terms.append(Noun_comp)
                        Noun_comp = u""
                elif (Mor_1[mi] != u"助詞" and Mor_1[mi] != u"助動詞"
                        and Mor[5] != u"サ変・スル" and Mor[2] != u"接尾"):
                    wakachi += Mor[0] + u" "
                    if Mor[0] not in terms:
                        terms.append(Mor[0])
        preR_id = Report_id
    documents.append(wakachi)
    return terms, documents
def robot_dof_descr(robot, var_name, language=Language('python')):
    """
    Returns the description of the vector containing all the degrees of
    freedom of the robot.

    Parameters
    ----------
    robot : robots.Robot
        Robot you want to describe
    var_name : str
        Name of the vector variable
    language : Language.Language
        Language of the generated docstring (used for vector subscription)
        Defaults to Language('python')

    Returns
    -------
    description : str
        Description of the vector

    Examples
    --------
    >>> from robots import RobotURDF
    >>> from URDF import URDF
    >>> from Language import Language
    >>> rob = RobotURDF(URDF("./Examples/example_0.urdf"))
    >>> print(robot_dof_descr(rob, 'q', Language("julia")))
    This is a (3 x 1) Matrix containing the state of all the joints where :
     - q[1] = theta_joint1 :
     Rotation value (in radians) around the joint1 joint axis.
     - q[2] = theta_joint2 :
     Rotation value (in radians) around the joint2 joint axis.
     - q[3] = theta_joint3 :
     Rotation value (in radians) around the joint3 joint axis.
    """
    params = get_parameters(robot.dof)
    descrq = (f"This is a ({len(robot.dof)} x 1) {language.matrix_type} "
              f"containing the state of all the joints where :\n")
    for i_p, param in enumerate(params):
        qp = language.slice_mat(var_name, i_p, None, None, None)
        descrq += f' - {qp} = {param["name"]}'
        descrq += f' :\n {param["description"]}\n'
    return descrq[:-1]
def tableGUI():
    langPath = loadGUI()
    try:
        lang = Language(langPath)
    except Exception:
        # Bail out: without a valid Language the rest of the window
        # cannot be built
        showMessage("The chosen file is not a language file, try again")
        return

    def submit():
        Action(lang, int(numOfCluster.get()),
               (stream.get(), addDB.get(), cluster.get()))

    window = Tk()
    window.geometry("600x300")
    window.resizable(0, 0)
    window.title("NAC - Loaded News Articles")
    mylabel = Label(window, text="Chosen language path :")
    mylabel.place(x=0, y=0, height=20, width=600)
    mylabel = Label(window, text="Current stored articles : {0}".format(
        len(lang.getArticles())))
    mylabel.place(x=0, y=40, height=20, width=600)
    Label(window, text=langPath).place(x=0, y=20, height=20, width=600)
    stream = IntVar()
    Checkbutton(window, text="Stream", variable=stream).place(
        x=0, y=60, height=20, width=600)
    addDB = IntVar()
    Checkbutton(window, text="Add To DataBase", variable=addDB).place(
        x=0, y=80, height=20, width=600)
    cluster = IntVar()
    Checkbutton(window, text="Cluster", variable=cluster).place(
        x=0, y=100, height=20, width=600)
    Label(window, text="Number of clusters:").place(
        x=0, y=180, height=20, width=600)
    numOfCluster = Entry(window)
    numOfCluster.insert(10, "2")
    numOfCluster.place(x=0, y=200, height=20, width=600)
    Button(window, text='Quit', command=window.destroy).place(
        x=0, y=270, height=20, width=600)
    Button(window, text='Submit', command=submit).place(
        x=0, y=250, height=20, width=600)
    window.mainloop()
def main():
    with open('datas/ZCloseness_centralities_results.txt', 'a') as result_file:
        result_file.write('Language | original cc | random cc | p_value \n')
        result_file.flush()
        for language_file in os.listdir('datas')[:5]:
            result_file.write(' {}'.format(language_file.split('_')[0]))
            result_file.flush()
            current_language = Language(language_file)
            original_graph = current_language.graph()
            # run the centrality calculation
            monte_carlo_centrality(original_graph, 150, 'switching', 20,
                                   'decrease', result_file=result_file)
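# What the reported p_value amounts to -- a generic permutation-test sketch,
# not the real monte_carlo_centrality implementation (its internals are not
# shown in this snippet); `observed` and `randomized` are toy numbers.

def mc_p_value(observed, randomized):
    """Fraction of randomized statistics at least as extreme as the observed
    one, with a +1 correction so the estimate is never exactly zero."""
    hits = sum(1 for r in randomized if r >= observed)
    return (hits + 1) / (len(randomized) + 1)

print(mc_p_value(0.42, [0.30, 0.35, 0.44, 0.31]))  # -> 0.4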
def to_class(self, Noun, Verb):
    if Noun in self.Nclass.keys():
        Nclasslist = self.Nclass[Noun]
    else:
        # Fall back to the last morpheme (the head) of the compound noun
        lan = Language(Noun)
        word = lan.getMorpheme()
        Noun_tail = word[len(word) - 1][0]
        if Noun_tail in self.Nclass.keys():
            Nclasslist = self.Nclass[Noun_tail]
        else:
            Nclasslist = [u"未登録"]  # "unregistered"
    if Verb in self.Vclass.keys():
        Vclasslist = self.Vclass[Verb]
    else:
        Vclasslist = [u"未登録"]  # "unregistered"
    # print Verb
    # Every (noun class, verb class) combination
    NV = list(itertools.product(Nclasslist, Vclasslist))
    return NV
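# What itertools.product contributes here -- a tiny standalone example with
# toy class labels (not real dictionary entries): to_class returns every
# (noun class, verb class) pairing.
#
#   >>> import itertools
#   >>> list(itertools.product([u"artifact", u"substance"], [u"change"]))
#   [('artifact', 'change'), ('substance', 'change')]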
def __init__(self):
    threading.Thread.__init__(self)
    self.datafile = open("w_report.csv", 'r')
    self.key = "http://api.openweathermap.org/data/2.5/weather?q=Kanchipuram&APPID=d500d89c5c31bf5d4e165dbaa8024895"
    self.last_update = []
    self.L = Language()
    # Graph mirroring the structure of an OpenWeatherMap JSON response
    self.Structure = nx.Graph()
    self.Structure.add_edges_from([
        ('weather', 'description'), ('weather', 'haze'),
        ('wind', 'deg'), ('wind', 'speed'),
        ('main', 'humidity'), ('main', 'temp'), ('main', 'pressure'),
        ('main', 'temp_max'), ('main', 'temp_min'),
        ('obj', 'dt'), ('obj', 'main'), ('obj', 'weather'), ('obj', 'wind')])
    self.Structure.add_edges_from([
        ('sys', 'sunrise'), ('sys', 'sunset'),
        ('obj', 'sys'), ('obj', 'visibility'), ('obj', 'coord'),
        ('obj', 'clouds'), ('clouds', 'all'), ('obj', 'name'),
        ('weather', 'id'), ('weather', 'main')])
    # networkx's keyword is with_labels; show_labels is not a
    # draw_networkx option
    nx.draw_networkx(self.Structure, with_labels=True)
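# The graph above mirrors the shape of an OpenWeatherMap "current weather"
# response; an abridged example of that JSON for orientation (field values
# are illustrative, not real measurements):
#
#   {"coord": {...},
#    "weather": [{"id": 721, "main": "Haze", "description": "haze"}],
#    "main": {"temp": 301.2, "pressure": 1011, "humidity": 74,
#             "temp_min": 300.4, "temp_max": 302.0},
#    "visibility": 6000, "wind": {"speed": 2.6, "deg": 240},
#    "clouds": {"all": 40}, "dt": 1560350645,
#    "sys": {"sunrise": 1560343627, "sunset": 1560396563},
#    "name": "Kanchipuram"}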
def test_isVerb(self):
    self.h = Verb(Language())
    self.assertEqual(self.h.isVerb("xoxo"), False)
    self.assertEqual(self.h.isVerb("xoxox"), False)
    self.assertEqual(self.h.isVerb("xoxoxo"), True)
    self.assertEqual(self.h.isVerb("xoxoxs"), False)
def test_isInLanguage(self):
    self.h = Language()
    self.assertEqual(self.h.isInLanguage("xoxox"), True)
def __init__(self):
    self.lObj = Language()
    self.vObj = Verb(self.lObj)
    self.pObj = Preposition(self.lObj)
    self.nObj = Numbers(self.vObj, self.pObj)
def __init__(self, n_len):
    # `path` is expected to be defined at module level (not shown here)
    self.core = Language(path, n_len)
def TNoun_extract(self, tripleFrame, NV_class):
    TW = TWordclass()
    unify_particle = lambda x: TW.Particle_to.get(x, x)
    tripleFrame[u"助詞"] = tripleFrame[u"助詞"].map(unify_particle)
    triple_Treport = [[] for i in range(len(tripleFrame.columns) + 1)]
    for R_id, S_id, V_id, noun, particle, verb in zip(tripleFrame[u"報告書_id"],
                                                      tripleFrame[u"文_id"],
                                                      tripleFrame[u"動詞_id"],
                                                      tripleFrame[u"名詞"],
                                                      tripleFrame[u"助詞"],
                                                      tripleFrame[u"動詞"]):
        # if id > 300: break
        # print noun, particle, verb
        print "Extracting triple_Treport:", R_id, S_id, V_id
        Lan = Language(noun)
        outList = Lan.getMorpheme()
        Mor_1 = [outList[k][1] for k in range(len(outList))]
        Mor_2 = [outList[k][2] for k in range(len(outList))]
        # Collapse runs of noun morphemes into compound nouns
        noun_comp_tmp = u""
        noun_comp = []
        noun_tail = []
        noun_Pos1 = []
        noun_Pos2 = []
        for mi, Pos in enumerate(Mor_1):
            if mi == len(Mor_1) - 1:
                noun_comp.append(noun_comp_tmp + outList[mi][0])
                noun_tail.append(outList[mi][0])
                noun_Pos1.append(outList[mi][1])
                noun_Pos2.append(outList[mi][2])
                break
            if Pos == u"名詞":
                if Mor_1[mi + 1] == u"名詞":
                    noun_comp_tmp += outList[mi][0]
                else:
                    noun_comp.append(noun_comp_tmp + outList[mi][0])
                    noun_tail.append(outList[mi][0])
                    noun_Pos1.append(outList[mi][1])
                    noun_Pos2.append(outList[mi][2])
                    noun_comp_tmp = u""
        TNneed = False
        TVneed = False
        # Decide whether the noun is tribology-related
        for cni, nounMor in enumerate(noun_comp):
            if noun_Pos2[cni] == u"代名詞":  # pronoun
                TNneed = True
                break
            if nounMor in NV_class[0].keys():
                noun_target = nounMor
            elif noun_tail[cni] in NV_class[0].keys():
                noun_target = noun_tail[cni]
            else:
                continue
            for Nclass in NV_class[0][noun_target]:
                if Nclass in TW.TNounclass_all:
                    TNneed = True
                    break
                elif Nclass in TW.TNounclass_Nopart.keys():
                    TNneed = True
                    for TNoun_Nopart in TW.TNounclass_Nopart[Nclass]:
                        if TNoun_Nopart in noun_target:
                            TNneed = False
                        elif Nclass == u"様相" and noun_Pos2[cni] == u"形容動詞語幹":
                            TNneed = False
                elif Nclass in TW.TNounclass_part.keys():
                    for TNoun_part in TW.TNounclass_part[Nclass]:
                        if TNoun_part in noun_target:
                            TNneed = True
                            break
                else:
                    continue
        # Decide whether the verb is tribology-related
        if TNneed:
            if verb in NV_class[1].keys():
                for Vclass in NV_class[1][verb]:
                    if Vclass in TW.TVerbclass_all:
                        TVneed = True
                        break
                    elif Vclass in TW.TVerbclass_Nopart.keys():
                        TVneed = True
                        for TVerb_Nopart in TW.TVerbclass_Nopart[Vclass]:
                            if TVerb_Nopart in verb:
                                TVneed = False
                    elif Vclass in TW.TVerbclass_part.keys():
                        for TVerb_part in TW.TVerbclass_part[Vclass]:
                            if TVerb_part in verb:
                                TVneed = True
                                break
                    else:
                        continue
        if TNneed and TVneed:
            # Split coordinated nouns at conjunctions and commas
            Mor_connect = [[u"接続詞"], [u"読点", u"並立助詞", u"接続助詞"]]
            if (set(Mor_connect[0]).intersection(set(Mor_1))
                    or set(Mor_connect[1]).intersection(set(Mor_2))):
                noun_con = u""
                for oi, out in enumerate(outList):
                    if (outList[oi][1] not in Mor_connect[0]
                            and outList[oi][2] not in Mor_connect[1]):
                        if out[0] != u"等":
                            noun_con += out[0]
                    else:
                        # print out[0], noun_con
                        triple_Treport[0].append(R_id)
                        triple_Treport[1].append(S_id)
                        triple_Treport[2].append(V_id)
                        triple_Treport[3].append(noun_con)
                        triple_Treport[4].append(particle)
                        triple_Treport[5].append(verb)
                        noun_con = u""
                        continue
                    if oi == len(outList) - 1:
                        triple_Treport[0].append(R_id)
                        triple_Treport[1].append(S_id)
                        triple_Treport[2].append(V_id)
                        triple_Treport[3].append(noun_con)
                        triple_Treport[4].append(particle)
                        triple_Treport[5].append(verb)
                        noun_con = u""
            else:
                triple_Treport[0].append(R_id)
                triple_Treport[1].append(S_id)
                triple_Treport[2].append(V_id)
                triple_Treport[3].append(noun)
                triple_Treport[4].append(particle)
                triple_Treport[5].append(verb)
    triple_Treportdict = {
        tripleFrame.columns[0]: triple_Treport[0],
        tripleFrame.columns[1]: triple_Treport[1],
        tripleFrame.columns[2]: triple_Treport[2],
        tripleFrame.columns[3]: triple_Treport[3],
        tripleFrame.columns[4]: triple_Treport[4],
        tripleFrame.columns[5]: triple_Treport[5],
    }
    tripleFrame_Treport = DataFrame(triple_Treportdict,
                                    columns=[i for i in tripleFrame.columns])
    fvu = lambda x: TW.Verb_unify.get(x, x)
    tripleFrame_Treport[u"動詞"] = tripleFrame_Treport[u"動詞"].map(fvu)
    return tripleFrame_Treport
def bunrui_frame(self, case_df, terms, idf_Treport, dist_method, threshould_dist):
    MorList = []
    Noun_comp = u""
    Noun_weight = 2.0
    idf_Treport = Series(idf_Treport)
    zero = min(idf_Treport)
    if zero == 0:
        # Replace zero idf values with half of the smallest nonzero idf
        min_idf = 1.0
        for idf in idf_Treport:
            if idf < min_idf and idf != zero:
                min_idf = idf
        idf_Treport[idf_Treport == 0] = min_idf * 0.5
    for frame in case_df[u"事象"].drop_duplicates().values:
        MorList_tmp = {}
        for i, words in enumerate(frame.split(u" ")):
            # print words, u":"
            if i == len(frame.split(u" ")) - 1:
                if words[-2:] != u"する":
                    MorList_tmp[words] = idf_Treport[terms.index(words)]
                else:
                    MorList_tmp[words[:-2]] = idf_Treport[terms.index(words[:-2])]
                Noun_comp = u""  # reset to avoid errors
            else:
                Lan = Language(words)
                outList = Lan.getMorpheme()
                Mor_1 = [outList[k][1] for k in range(len(outList))]
                for mi, Mor in enumerate(outList):
                    if Mor_1[mi] == u"名詞" and Mor[2] != u"形容動詞語幹":
                        Noun_comp += Mor[0]
                        if mi < len(Mor_1) - 1:
                            if Mor_1[mi + 1] != u"名詞":
                                # print Noun_comp
                                MorList_tmp[Noun_comp] = idf_Treport[terms.index(Noun_comp)] * Noun_weight
                                Noun_comp = u""
                        else:
                            # print Noun_comp
                            MorList_tmp[Noun_comp] = idf_Treport[terms.index(Noun_comp)] * Noun_weight
                            Noun_comp = u""
                    elif (Mor_1[mi] != u"助詞" and Mor_1[mi] != u"助動詞"
                            and Mor[5] != u"サ変・スル" and Mor[2] != u"接尾"):
                        # print Mor[0]
                        MorList_tmp[Mor[0]] = idf_Treport[terms.index(Mor[0])]
        MorList.append(MorList_tmp)
    # caseFrame = case_df[u"主体"] + u" " + case_df[u"起点"] + u" " + case_df[u"対象"] + u" " + case_df[u"状況"] + u" " + case_df[u"着点"] + u" " + case_df[u"手段"] + u" " + case_df[u"関係"] + u" " + case_df[u"動詞"]
    cf = [i for i in case_df[u"事象"].drop_duplicates()]
    # Build the unification dictionary from pairwise similarity coefficients
    Wdist_index = []
    Wdist_column = []
    Wdist = []
    unifyList = {}
    Case_freq = Counter(case_df[u"事象"])
    # Do not merge two case frames unless they agree on these antonym-like verbs
    oppositeList = [u"良好", u"正常", u"低下"]
    print len(cf), len(MorList)
    for i, x in enumerate(MorList):
        print u"calculating distance... ", i
        x_keys_set = set(x.keys())
        for j, y in enumerate(MorList[i + 1:]):
            j = i + j + 1
            y_keys_set = set(y.keys())
            # print j
            if (bool(set(oppositeList).intersection(x_keys_set))
                    and not bool(y_keys_set.issuperset(set(oppositeList).intersection(x_keys_set)))):
                continue
            elif (bool(set(oppositeList).intersection(y_keys_set))
                    and not bool(x_keys_set.issuperset(set(oppositeList).intersection(y_keys_set)))):
                continue
            # Skip case frames that have already been unified (the original
            # tested `x.keys() in unifyList.keys()`, which is always False)
            if (cf[i] in unifyList.keys()) | (cf[j] in unifyList.keys()):
                continue
            if bool(x_keys_set.intersection(y_keys_set)) and cf[i] != cf[j]:
                sym = x_keys_set.symmetric_difference(y_keys_set)
                if len(sym) > 3:
                    continue
                # The symmetric difference must not contain two or more nouns
                # (disabled: too slow)
                '''
                Mor_sd = [(Language(sdm).getMorpheme().pop()[1],
                           Language(sdm).getMorpheme().pop()[2]) for sdm in sym]
                for Mor_set in [(u"名詞", u"サ変接続"), (u"名詞", u"形容動詞語幹")]:
                    while Mor_set in Mor_sd:
                        Mor_sd.remove(Mor_set)
                if [ms[0] for ms in Mor_sd].count(u"名詞") < 2:
                '''
                xy_set = dict(x.items() + y.items())
                xy_insec = x_keys_set.intersection(y_keys_set)
                w_all = 0.00
                if dist_method == u"Jaccard":
                    # Jaccard coefficient
                    for mor_val in xy_set.values():
                        w_all += mor_val
                elif dist_method == u"Simpson":
                    # Simpson coefficient
                    if len(x.keys()) < len(y.keys()):
                        for mor_val in x.values():
                            w_all += mor_val
                    else:
                        for mor_val in y.values():
                            w_all += mor_val
                w_insec = 0.00
                for mor_val in xy_insec:
                    w_insec += xy_set[mor_val]
                dist_str = w_insec / w_all
                if dist_str >= threshould_dist:
                    '''
                    if dist_method == u"Jaccard":
                        # Unify to the more frequent case frame
                        if Case_freq[cf[j]] <= Case_freq[cf[i]] and cf[i] not in unifyList.keys():
                            unifyList[cf[i]] = cf[j]
                        elif Case_freq[cf[j]] > Case_freq[cf[i]] and cf[j] not in unifyList.keys():
                            unifyList[cf[j]] = cf[i]
                    elif dist_method == u"Simpson":
                    '''
                    # Unify to the case frame with fewer morphemes
                    if len(x_keys_set) < len(y_keys_set) and cf[j] not in unifyList.keys():
                        unifyList[cf[j]] = cf[i]
                    elif len(x_keys_set) > len(y_keys_set) and cf[i] not in unifyList.keys():
                        unifyList[cf[i]] = cf[j]
                    elif len(cf[i]) < len(cf[j]) and cf[j] not in unifyList.keys():
                        unifyList[cf[j]] = cf[i]
                    elif len(cf[i]) >= len(cf[j]) and cf[i] not in unifyList.keys():
                        unifyList[cf[i]] = cf[j]
                # print "%d:%s" % (i, cf[i]), "%d:%s" % (j, cf[j]), dist_str, w_insec, w_all
                # print outList[len(outList) - 1][0], outList[len(outList) - 1][1], outList[len(outList) - 1][2]
                Wdist_index.append(cf[i])
                Wdist_column.append(cf[j])
                Wdist.append(dist_str)
    Wdist = DataFrame(Wdist, index=[Wdist_index, Wdist_column],
                      columns=[u"Similarity"])
    fnc = lambda x: unifyList.get(x, x)
    # Apply the unification map until it reaches a fixed point
    insecset = set()
    while set(case_df[u"事象"]).intersection(set(unifyList.keys())):
        if insecset == set(case_df[u"事象"]).intersection(set(unifyList.keys())):
            break
        case_df[u"事象"] = case_df[u"事象"].map(fnc)
        insecset = set(case_df[u"事象"]).intersection(set(unifyList.keys()))
    return case_df, Wdist
def table1():
    print(" N | E | <k> | delta ")
    for input_file in os.listdir("datas"):
        current_language = Language(input_file)
        print(current_language.characteristics())
def test_isNumber(self):
    self.h = Numbers(Verb(Language()), Preposition(Language()))
    self.assertEqual(self.h.isNumber("gxjrc"), True)
    self.assertEqual(self.h.isNumber("pwdood"), False)
def Section_div(self, case_df, VC_Dc, thresold_perD):
    Record_id = dict()  # (report id, sentence id) -> record id
    Record_id[(case_df.ix[0, :][u"報告書_id"], case_df.ix[0, :][u"文_id"])] = 0
    tail_key = -1
    for Report_id in case_df[u"報告書_id"].drop_duplicates():
        print u"Extracting Sec_id:", Report_id
        Noun_pre = dict()
        case_df_perR = case_df[case_df[u"報告書_id"] == Report_id]
        for first_Sen, Sentence_id in enumerate(case_df_perR[u"文_id"].drop_duplicates()):
            for line in case_df_perR[case_df[u"文_id"] == Sentence_id].iterrows():
                # print line[1][1]
                # print line[1][3]
                if line[1][1] not in Noun_pre.keys():
                    Noun_pre[line[1][1]] = [l for l in line[1][4:11].values if l != u" "]
                else:
                    Noun_pre[line[1][1]] = Noun_pre[line[1][1]] + \
                        [l for l in line[1][4:11].values if l != u" "]
                # Resolve pronouns
                for di, l in enumerate(line[1][4:11].values):
                    if l != u" ":
                        Lan = Language(l)
                        outList = Lan.getMorpheme()
                        if set([u"代名詞"]).intersection(
                                set([outList[k][2] for k in range(len(outList))])):
                            # Pick the noun whose output vector has the smallest
                            # Euclidean distance to the pronoun's output vector
                            pronoun_vec = [np.array(out_perD[1])
                                           for out_perD in self.Dc.predict(l, u"", line[1][3])
                                           if np.argmax(np.array(out_perD[1])) == di]
                            if len(pronoun_vec) == 0:
                                pronoun_vec = [np.array(out_perD[1])
                                               for out_perD in self.Dc.predict(l, u"", line[1][3])]
                            Noun_out = [{Np: [np.array(output[1])
                                              for output in self.Dc.predict(Np, u"", line[1][3])]
                                         for Np in Np_list if Np not in line[1][4:11].values}
                                        for Np_list in [Noun_pre[line[1][1] - pre_i]
                                                        for pre_i in range(0, 2)
                                                        if line[1][1] - pre_i in Noun_pre.keys()]]
                            if len(Noun_out[0]) == 0:
                                del Noun_out[0]
                            if len(Noun_out) == 0:
                                break
                            Neuclid_perS = [{n: min([np.linalg.norm(pv - vec)
                                                     for vec in No[n] for pv in pronoun_vec])
                                             for n in No.keys()} for No in Noun_out]
                            Neuclid_min_perS = [(perS.keys()[perS.values().index(min(perS.values()))],
                                                 min(perS.values()))
                                                for perS in Neuclid_perS]
                            toNoun = [N_ed[0] for N_ed in Neuclid_min_perS
                                      if min([nmp[1] for nmp in Neuclid_min_perS]) == N_ed[1]]
                            case_df.ix[line[0], u"事象"] = \
                                case_df.ix[line[0], u"事象"].replace(l, toNoun[0])
                            Noun_pre[line[1][1]][Noun_pre[line[1][1]].index(l)] = toNoun[0]
                            '''
                            # Alternative: pick the noun with the maximum output
                            # for this deep case
                            Noun_out = [{Np: max([output[1][di]
                                                  for output in self.Dc.predict(Np, u"", line[1][3])])
                                         for Np in Np_list if Np not in line[1][4:11].values}
                                        for Np_list in [Noun_pre[line[1][1] - pre_i]
                                                        for pre_i in range(0, 2)
                                                        if line[1][1] - pre_i in Noun_pre.keys()]]
                            # del Noun_out[0][l]
                            if len(Noun_out[0]) == 0:
                                del Noun_out[0]
                            if len(Noun_out) == 0:
                                break
                            MaxN_perS = [No[No.keys()[No.values().index(max(No.values()))]] for No in Noun_out]
                            SSen_rec = MaxN_perS.index(max(MaxN_perS))
                            toNoun = Noun_out[SSen_rec].keys()[Noun_out[SSen_rec].values().index(max(MaxN_perS))]
                            case_df.ix[line[0], u"事象"] = case_df.ix[line[0], u"事象"].replace(l, toNoun)
                            Noun_pre[line[1][1]][Noun_pre[line[1][1]].index(l)] = toNoun
                            '''
                # Fill unfilled deep cases (zero pronouns)
                Deep_cand = []
                for dc in [VC_Dc[VC] for VC in self.Dc.NV_class[1][line[1][3]] if VC in VC_Dc]:
                    Deep_cand += dc
                Count_perD = [Deep_cand.count(d) for d in self.Dc.DeepCaseList]
                Dc_toV = [Deep_cor for Deep_cor in
                          [d for d, Deep_cor in zip(self.Dc.DeepCaseList, Count_perD)
                           if sum(Count_perD) / float(len(Count_perD)) < Deep_cor]]
                Noun_zero = dict()
                for Dc_tmp in Dc_toV:
                    if line[1][Dc_tmp] == u" ":
                        Noun_out = [{Np: max([output[1][self.Dc.DeepCaseList.index(Dc_tmp)]
                                              for output in self.Dc.predict(Np, u"", line[1][3])])
                                     for Np in Np_list if Np not in line[1][4:11].values}
                                    for Np_list in [Noun_pre[line[1][1] - pre_i]
                                                    for pre_i in range(0, 2)
                                                    if line[1][1] - pre_i in Noun_pre.keys()]]
                        while {} in Noun_out:
                            Noun_out.remove({})
                        if len(Noun_out) == 0:
                            continue
                        MaxN_perS = [No[No.keys()[No.values().index(max(No.values()))]]
                                     for No in Noun_out]
                        SSen_rec = MaxN_perS.index(max(MaxN_perS))
                        if max(MaxN_perS) > thresold_perD[self.Dc.DeepCaseList.index(Dc_tmp)]:
                            Noun_zero[Dc_tmp] = Noun_out[SSen_rec].keys()[
                                Noun_out[SSen_rec].values().index(max(MaxN_perS))]
                if len(Noun_zero.keys()) > 0:
                    case_zero = u""
                    for d, Noun_perD in zip(line[1][4:11].keys(), line[1][4:11].values):
                        if d in Noun_zero.keys():
                            case_zero += u" " + Noun_zero[d]
                        else:
                            case_zero += u" " + Noun_perD
                    case_zero += u" " + line[1][3]
                    case_zero = re.sub(r" +", u" ", case_zero.strip())
                    case_df.ix[line[0], u"事象"] = case_zero
                    for Noun_zero_tmp in Noun_zero.values():
                        Noun_pre[line[1][1]].append(Noun_zero_tmp)
            # Does this sentence share a noun with one of the previous sentences?
            if first_Sen == 0 and tail_key != -1:
                Record_id[(Report_id, Sentence_id)] = Record_id[tail_key] + 1
                continue
            for pre_i in range(3, 0, -1):
                if line[1][1] - pre_i in Noun_pre.keys():
                    if set(Noun_pre[line[1][1]]).intersection(set(Noun_pre[line[1][1] - pre_i])):
                        for pre_j in range(pre_i, -1, -1):
                            if line[1][1] - pre_j in Noun_pre.keys():
                                Record_id[(Report_id, Sentence_id - pre_j)] = \
                                    Record_id[(Report_id, line[1][1] - pre_i)]
                        break
                    else:
                        Record_id[(Report_id, Sentence_id)] = \
                            Record_id[(Report_id, line[1][1] - pre_i)] + 1
            while (Report_id, Sentence_id) not in Record_id.keys():
                pre_i += 1
                if (Report_id, Sentence_id - pre_i) in Record_id.keys():
                    Record_id[(Report_id, Sentence_id)] = \
                        Record_id[(Report_id, line[1][1] - pre_i)] + 1
            if first_Sen == len(case_df_perR[u"文_id"].drop_duplicates()) - 1:
                tail_key = (Report_id, Sentence_id)
    case_df[u"レコード_id"] = [(i, j) for i, j in zip(case_df[u"報告書_id"], case_df[u"文_id"])]
    case_df[u"レコード_id"] = case_df[u"レコード_id"].map(lambda x: Record_id[x])
    return case_df
import re
import pickle
import unicodedata

import xlrd
from pandas import Series

# Word_class and Language are assumed to come from sibling modules of this
# project (their import lines were not part of this snippet).


def uniqword(seq):
    # Order-preserving deduplication (module level so __main__ can call it)
    seen = set()
    seen_add = seen.add
    return [x for x in seq if x not in seen and not seen_add(x)]


class Treport:
    def __init__(self, path):
        book = xlrd.open_workbook(path)
        sheets = book.sheets()
        self.s = sheets[0]

    def NV_class_load(self, NV_classpath):
        with open(NV_classpath.decode("shift-jis").encode("utf-8")) as f:
            NV_class = pickle.load(f)
        return NV_class

    def delete_unnecc(self, i):
        # Normalize full-width punctuation, Roman numerals and enclosed
        # numbers that the morphological analyzer cannot handle
        noenc = self.s.cell_value(i, 3).replace(u'-', u'-')
        noenc = noenc.replace(u'~', u'~')
        noenc = noenc.replace(u'', u'')  # unneeded-report marker
        noenc = noenc.replace(u'Ⅰ', u'1')
        noenc = noenc.replace(u'Ⅱ', u'2')
        noenc = noenc.replace(u'Ⅲ', u'3')
        noenc = noenc.replace(u'Ⅳ', u'4')
        noenc = noenc.replace(u'Ⅴ', u'5')
        noenc = noenc.replace(u'ⅰ', u'1')
        noenc = noenc.replace(u'ⅱ', u'2')
        noenc = noenc.replace(u'ⅲ', u'3')
        noenc = noenc.replace(u'ⅳ', u'4')
        noenc = noenc.replace(u'ⅴ', u'5')
        noenc = noenc.replace(u'⑪', u'11')
        noenc = noenc.replace(u'⑫', u'12')
        noenc = noenc.replace(u'⑰', u'17')
        noenc = noenc.replace(u'⑲', u'19')
        noenc = noenc.replace(u'№', u'No.')
        noenc = noenc.replace(u'㎎', u'mg')
        noenc = noenc.replace(u'㎜', u'mm')
        noenc = noenc.replace(u'㎡', u'm^2')
        noenc = noenc.replace(u'㍑', u'リットル')
        noenc = noenc.replace(u'槢', u'摺')
        noenc = noenc.replace(u'<', u'<')
        noenc = noenc.replace(u'>', u'>')
        return noenc

    def NV_extract(self):
        NList = []
        VList = []
        for i in range(1, self.s.nrows):
            print i
            noenc = self.delete_unnecc(i)
            lan = Language(noenc)
            if len(lan.str) > 4000:
                continue
            word = lan.getMorpheme()
            tmpNoun = u""
            NN = 0
            for j, line in enumerate(word):
                if line[1] == u"動詞":  # verb
                    if line[7] == u"する":
                        VList.append(word[j - 1][0] + line[7])
                    else:
                        VList.append(line[7])
                if line[1] == u"名詞" and j != len(word) - 1:  # noun
                    if word[j + 1][1] == u"名詞":
                        tmpNoun += line[0]
                        NN += 1
                        if NN > 3:
                            tmpNoun = u""
                            NN = 0
                            continue
                    elif word[j + 1][7] != u"する":
                        NList.append(tmpNoun + line[0])
                        tmpNoun = u""
                        NN = 0
        return NList, VList


if __name__ == '__main__':
    path = u'D:/研究/データ/report_data_ver4_1.xlsx'
    from Treport import Treport
    TR = Treport(path)
    NV_classpath = "C:/tmp/Evaluation/NV_class.Word"
    NV_class = TR.NV_class_load(NV_classpath)
    NList, VList = TR.NV_extract()
    Wc = Word_class(NV_class)
    fnc = lambda x: Wc.Nclass.get(x, "No entry")
    Noun_uniq = uniqword(NList)
    Noun_uniq2 = Series(Noun_uniq).map(fnc)
    fvc = lambda x: Wc.Vclass.get(x, "No entry")
    Verb_uniq = uniqword(VList)
    Verb_uniq2 = Series(Verb_uniq).map(fvc)
    # Keep only unclassified nouns that contain no digits or ASCII symbols
    Noun_uniq3 = []
    for N in list(Series(Noun_uniq)[Noun_uniq2 == "No entry"]):
        numin = False
        alpha = False
        N_uni = unicodedata.normalize('NFKC', N)
        if re.search("[0-9]", N_uni):
            numin = True
        if re.search("[{-~]", N_uni) or re.search("[[-`]", N_uni) or \
                re.search("[ -/]", N_uni) or re.search("[:-@]", N_uni):
            alpha = True
        if numin is False and alpha is False:
            Noun_uniq3.append(N_uni)
    Verb_uniq3 = []
    for V in list(Series(Verb_uniq)[Verb_uniq2 == "No entry"]):
        numin = False
        alpha = False
        V_uni = unicodedata.normalize('NFKC', V)
        if re.search("[0-9]", V_uni):
            numin = True
        if re.search("[{-~]", V_uni) or re.search("[[-`]", V_uni) or \
                re.search("[ -/]", V_uni) or re.search("[:-@]", V_uni):
            alpha = True
        if numin is False and alpha is False:
            Verb_uniq3.append(V_uni)
    # Classify compound nouns by their final (head) morpheme
    Noun_tail = []
    i = 0
    for compN in Noun_uniq3:
        print i
        i += 1
        lan = Language(compN)
        word = lan.getMorpheme()
        Noun_tail.append(word[len(word) - 1][0])
    Noun_tail_uniq = uniqword(Noun_tail)
    Noun_tail_uniq2 = Series(Noun_tail_uniq).map(fnc)
    Noun_tail_uniq3 = []
    for N in list(Series(Noun_tail_uniq)[Noun_tail_uniq2 == "No entry"]):
        Noun_tail_uniq3.append(N)
def test_isPreposition(self):
    self.h = Preposition(Language())
    self.assertEqual(self.h.isPreposition("puxod"), False)
    self.assertEqual(self.h.isPreposition("puxood"), False)
    self.assertEqual(self.h.isPreposition("pwdood"), True)
def Triple_extract(self, path):
    TR = Treport(path)
    triplelist = {}
    Mor_con = [[u"形容詞", u"助動詞", u"接続詞"],
               [u"連体化", u"並立助詞", u"読点", u"接続助詞"]]
    for i in range(1, TR.s.nrows):
        # if i > TR.s.nrows / 2: break
        if i > 10:
            break
        print i
        noenc = TR.delete_unnecc(i)
        # print TR.s.cell_value(i, 2).replace(u"-", u"")
        # print noenc
        for Sentence_id, perSen in enumerate(noenc.split(u"。")):  # TR.s.cell_value(i, 2)
            Lan = Language(perSen)
            cabocha_xml = Lan.cabocha_command()
            chunkinfo, tokinfo, sentence_tok = Lan.chunk_structured(cabocha_xml)
            # triple_perR = []
            # id_perR = []
            for chunk in chunkinfo:
                compnoun_tail_id = -1
                for tok_id, tokinfo_mor in enumerate(tokinfo[int(chunk[u"id"])]):
                    # print tok_id, compnoun_tail_id
                    if tok_id <= compnoun_tail_id:
                        continue
                    sentence_tok_set = sentence_tok[int(chunk[u"id"])]
                    if tokinfo_mor[0] == u"名詞":  # noun: start a compound noun
                        Noun = sentence_tok_set[tok_id]
                        compnoun_tail_id = tok_id
                        for tok_id_noun in range(tok_id + 1, len(tokinfo[int(chunk[u"id"])]) - 1):
                            if tokinfo[int(chunk[u"id"])][tok_id_noun][0] == u"名詞":
                                if sentence_tok[int(chunk[u"id"])][tok_id_noun] == u"濃度":
                                    continue
                                Noun += sentence_tok[int(chunk[u"id"])][tok_id_noun]
                                compnoun_tail_id = tok_id_noun
                            else:
                                break
                        if compnoun_tail_id + 1 == len(tokinfo[int(chunk[u"id"])]):
                            continue
                        # Pull in preceding chunks that modify this noun
                        chunk_id_from = int(chunk[u"id"])
                        for i_from in reversed(range((int(chunk["id"]) + 1) * -1, 0)):
                            if int(chunkinfo[int(chunk[u"id"]) + i_from]["link"]) == chunk_id_from:
                                chunk_id_from -= 1
                                from_tail_tok = tokinfo[int(chunk[u"id"]) + i_from][
                                    len(tokinfo[int(chunk[u"id"]) + i_from]) - 1]
                                if from_tail_tok[0] in Mor_con[0] or from_tail_tok[1] in Mor_con[1]:
                                    for sentence_tok_from in reversed(list(
                                            sentence_tok[int(chunkinfo[int(chunk[u"id"]) + i_from]["id"])])):
                                        Noun = sentence_tok_from + Noun
                                else:
                                    break
                            else:
                                break
                        if (tokinfo[int(chunk[u"id"])][compnoun_tail_id + 1][0] == u"助詞"
                                and tokinfo[int(chunk[u"id"])][compnoun_tail_id + 1][1] != u"接続助詞"):
                            Particle = tokinfo[int(chunk[u"id"])][compnoun_tail_id + 1][6]
                            Noun_suru = u""
                            for tok_id_link, tok_link_mor in enumerate(tokinfo[int(chunk[u"link"])]):
                                if tok_link_mor[0] == u"名詞" and tok_link_mor[1] != u"形容動詞語幹":
                                    Noun_suru += sentence_tok[int(chunk[u"link"])][tok_id_link]
                                    continue
                                if (tok_link_mor[0] == u"動詞" or tok_link_mor[0] == u"形容詞"
                                        or tok_link_mor[1] == u"形容動詞語幹"):
                                    if tok_link_mor[1] != u"末尾":
                                        Verb = u""
                                        if tok_link_mor[6] == u"する" or tok_link_mor[6] == u"できる":
                                            Verb = Noun_suru + u"する"
                                        else:
                                            Verb = tok_link_mor[6]
                                        Verb_id = int(chunk[u"link"])
                                        # Keep only the digits of the report id
                                        if isinstance(TR.s.cell_value(i, 2), float):
                                            id_tuple = (TR.s.cell_value(i, 2), Sentence_id, Verb_id)
                                        else:
                                            if re.search("[0-9]", TR.s.cell_value(i, 2)) is None:
                                                id_tuple = (re.search(
                                                    "\d+[-]*\d+",
                                                    TR.s.cell_value(i, 1)[re.search(
                                                        "\d+[-]*\d+",
                                                        TR.s.cell_value(i, 1)).end():]).group(0).replace(u"-", u""),
                                                    Sentence_id, Verb_id)
                                            else:
                                                id_tuple = (re.search(
                                                    "\d+[-]*\d+",
                                                    TR.s.cell_value(i, 2)).group(0).replace(u"-", u""),
                                                    Sentence_id, Verb_id)
                                        if id_tuple not in triplelist.keys():
                                            triplelist[id_tuple] = [(Noun, Particle, Verb)]
                                        else:
                                            triple_tmp = triplelist[id_tuple]
                                            triple_tmp.append((Noun, Particle, Verb))
                                            triplelist[id_tuple] = triple_tmp
                                        # print Noun, Particle, Verb, TR.s.cell_value(i, 2).replace(u"-", u""), Sentence_id, Verb_id
                                        break
    return triplelist
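# Shape of the extracted data, for orientation (illustrative sentence, not
# taken from the real workbook): for the clause 「オイルが漏れる」 ("oil
# leaks"), the extracted triple is (noun, particle, verb) =
# (オイル, が, 漏れる), stored in triplelist under the key
# (report id, sentence id, verb chunk id).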
def setupUi(self, mainWindow):
    # styling (use a context manager so the stylesheet file is closed)
    stylesheetFile = "Stylesheet.css"
    with open(stylesheetFile) as fh:
        qstr = str(fh.read())
    # assign mainwindow
    self.MainWindow = mainWindow
    self.MainWindow.setStyleSheet(qstr)
    self.MainWindow.setObjectName("MainWindow")
    self.MainWindow.resize(1000, 650)
    # setup basic variables
    self.devices = []
    self.currentDevice = None
    self.setupLog()
    self.mainQueue = Queue()
    self.lang = Language(0)
    # fill mainwindow
    self.centralwidget = QtWidgets.QWidget(self.MainWindow)
    self.centralwidget.setObjectName("centralwidget")
    self.verticalLayoutWidget = QtWidgets.QWidget(self.centralwidget)
    self.verticalLayoutWidget.setGeometry(QtCore.QRect(0, 0, 90, 650))
    self.verticalLayoutWidget.setObjectName("verticalLayoutWidget")
    self.verticalLayout = QtWidgets.QVBoxLayout(self.verticalLayoutWidget)
    self.verticalLayout.setContentsMargins(0, 0, 0, 0)
    self.verticalLayout.setObjectName("buttonBar")
    spacerItem = QtWidgets.QSpacerItem(20, 50, QtWidgets.QSizePolicy.Minimum,
                                       QtWidgets.QSizePolicy.Minimum)
    self.verticalLayout.addItem(spacerItem)
    self.addADevice = QtWidgets.QPushButton(self.verticalLayoutWidget)
    self.addADevice.setObjectName("addADevice")
    self.addADevice.setFixedSize(90, 90)
    self.verticalLayout.addWidget(self.addADevice)
    self.Manual = QtWidgets.QPushButton(self.verticalLayoutWidget)
    self.Manual.setObjectName("Manual")
    self.Manual.setFixedSize(90, 90)
    self.verticalLayout.addWidget(self.Manual)
    self.Graphs = QtWidgets.QPushButton(self.verticalLayoutWidget)
    self.Graphs.setObjectName("Graphs")
    self.Graphs.setFixedSize(90, 90)
    self.verticalLayout.addWidget(self.Graphs)
    self.Settings = QtWidgets.QPushButton(self.verticalLayoutWidget)
    self.Settings.setObjectName("Settings")
    self.Settings.setFixedSize(90, 90)
    self.verticalLayout.addWidget(self.Settings)
    spacerItem1 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum,
                                        QtWidgets.QSizePolicy.Expanding)
    self.verticalLayout.addItem(spacerItem1)
    self.Info = QtWidgets.QPushButton(self.verticalLayoutWidget)
    self.Info.setObjectName("Info")
    self.Info.setFixedSize(90, 90)
    self.verticalLayout.addWidget(self.Info)
    spacerItem2 = QtWidgets.QSpacerItem(20, 20, QtWidgets.QSizePolicy.Minimum,
                                        QtWidgets.QSizePolicy.Fixed)
    self.verticalLayout.addItem(spacerItem2)
    self.horizontalLayoutWidget = QtWidgets.QWidget(self.centralwidget)
    self.horizontalLayoutWidget.setGeometry(QtCore.QRect(90, 0, 910, 50))
    self.horizontalLayoutWidget.setObjectName("horizontalLayoutWidget")
    self.horizontalLayout_2 = QtWidgets.QHBoxLayout(self.horizontalLayoutWidget)
    self.horizontalLayout_2.setContentsMargins(0, 0, 0, 0)
    self.horizontalLayout_2.setObjectName("horizontalLayout_2")
    self.Logo = QtWidgets.QLabel(self.horizontalLayoutWidget)
    self.Logo.setEnabled(True)
    sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Fixed,
                                       QtWidgets.QSizePolicy.Expanding)
    sizePolicy.setHorizontalStretch(0)
    sizePolicy.setVerticalStretch(0)
    sizePolicy.setHeightForWidth(self.Logo.sizePolicy().hasHeightForWidth())
    self.Logo.setSizePolicy(sizePolicy)
    self.Logo.setMinimumSize(QtCore.QSize(0, 0))
    self.Logo.setMaximumSize(QtCore.QSize(250, 50))
    font = QtGui.QFont()
    font.setFamily("Calibri")
    font.setPointSize(16)
    font.setBold(True)
    font.setWeight(75)
    self.Logo.setFont(font)
    self.Logo.setAutoFillBackground(True)
    self.Logo.setFrameShape(QtWidgets.QFrame.Box)
    self.Logo.setFrameShadow(QtWidgets.QFrame.Raised)
    self.Logo.setObjectName("Logo")
    pic = QPixmap('rsz_1aerosdev')
    self.Logo.setPixmap(pic)
    self.horizontalLayout_2.addWidget(self.Logo)
    spacerItem3 = QtWidgets.QSpacerItem(20, 20, QtWidgets.QSizePolicy.Expanding,
                                        QtWidgets.QSizePolicy.Minimum)
    self.horizontalLayout_2.addItem(spacerItem3)
    self.fSkyTemp = QtWidgets.QFrame(self.horizontalLayoutWidget)
    sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Fixed,
                                       QtWidgets.QSizePolicy.Expanding)
    sizePolicy.setHorizontalStretch(0)
    sizePolicy.setVerticalStretch(0)
    sizePolicy.setHeightForWidth(self.fSkyTemp.sizePolicy().hasHeightForWidth())
    self.fSkyTemp.setSizePolicy(sizePolicy)
    self.fSkyTemp.setMinimumSize(QtCore.QSize(180, 100))
    self.fSkyTemp.setFrameShape(QtWidgets.QFrame.StyledPanel)
    self.fSkyTemp.setFrameShadow(QtWidgets.QFrame.Raised)
    self.fSkyTemp.setObjectName("fSkyTemp")
    self.Sky = QtWidgets.QLabel(self.fSkyTemp)
    self.Sky.setGeometry(QtCore.QRect(10, 20, 75, 13))
    self.Sky.setObjectName("Sky")
    self.TempUp = QtWidgets.QLabel(self.fSkyTemp)
    self.TempUp.setGeometry(QtCore.QRect(100, 20, 75, 13))
    sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.Fixed,
                                       QtWidgets.QSizePolicy.Preferred)
    sizePolicy.setHorizontalStretch(0)
    sizePolicy.setVerticalStretch(0)
    sizePolicy.setHeightForWidth(self.TempUp.sizePolicy().hasHeightForWidth())
    self.TempUp.setSizePolicy(sizePolicy)
    self.TempUp.setMinimumSize(QtCore.QSize(60, 0))
    self.TempUp.setObjectName("TempUp")
    self.horizontalLayout_2.addWidget(self.fSkyTemp)
    spacerItem4 = QtWidgets.QSpacerItem(40, 20, QtWidgets.QSizePolicy.Fixed,
                                        QtWidgets.QSizePolicy.Minimum)
    self.horizontalLayout_2.addItem(spacerItem4)
    self.stackedWidget = QtWidgets.QStackedWidget(self.centralwidget)
    self.stackedWidget.setGeometry(QtCore.QRect(90, 50, 910, 600))
    # self.stackedWidget.setMinimumSize(QtCore.QSize(600, 600))  # 400, 400
    # self.stackedWidget.move(100, 100)
    # self.stackedWidget.setStyleSheet("background-color: black")
    # sets up maingrid and adds it to stacked widget
    self.page0 = QtWidgets.QWidget(self.MainWindow)
    self.mainGrid = MainGrid(self.page0, self.devices)
    self.stackedWidget.addWidget(self.mainGrid.page0)
    # sets up pages
    self.setupSettingsWindow()
    self.setupEnterDevice()
    self.setupGraphsWindow()
    self.setupManual()
    # sets starting page
    self.stackedWidget.setCurrentIndex(0)
    # binds functions to mainwindow buttons
    self.addADevice.clicked.connect(lambda: self.setIndex(2))
    self.Manual.clicked.connect(lambda: self.setIndex(4))
    self.Graphs.clicked.connect(lambda: self.setIndex(3))
    self.Settings.clicked.connect(lambda: self.setIndex(1))
    self.Info.clicked.connect(self.showInfo)
    QtCore.QMetaObject.connectSlotsByName(self.MainWindow)
    self.MainWindow.setCentralWidget(self.centralwidget)
    self.retranslateUi(0)
from Language import Language

for _ in range(10):
    lang = Language()
    for _ in range(5):
        print(lang.name())
    print('')

# s = ''
# for _ in range(10):
#     s += lang.word() + ' '
# print(s.capitalize())
# print('')
# print(lang.name())