def checkCfgConfig(self, cfg_params):
    """
    Read the storage and publication settings from the [USER] section of crab.cfg.
    """
    self.datasvc_url = cfg_params.get("USER.datasvc_url", self.datasvc_url)
    self.srm_version = cfg_params.get("USER.srm_version", 'srmv2')
    self.node = cfg_params.get('USER.storage_element', None)
    self.usenamespace = cfg_params.get("USER.usenamespace", 0)
    self.user_remote_dir = cfg_params.get("USER.user_remote_dir", '')
    self.publish_data = cfg_params.get("USER.publish_data", 0)
    if int(self.publish_data) == 1:
        # only accept valid PhEDEx Node Names
        import Lexicon
        try:
            Lexicon.cmsname(self.node)
        except Exception, text:
            msg = "%s\n'%s' is not a valid PhEDEx Node Name" % (text, self.node)
            msg += "\n***************** NOTICE ***************"
            msg += "\nOnly valid PhEDEx Node Names can be used as location for published data"
            msg += "\nPlease fix storage_element or set publish_data=0 in [USER] section of crab.cfg"
            if 'group' in self.user_remote_dir:
                msg += '\nIf you are trying to stage out to some /store/group area, you can do like:'
                msg += '\n    storage_element = T2_US_UCSD'
                msg += '\n    user_remote_dir = /store/group/foo/bar'
            msg += "\n****************************************"
            raise CrabException(msg)
def runLexicon(articles, sentLex):
    # generate a dictionary mapping each article to its sentiment score
    articleScores = Lexicon.allScores(articles, sentLex)
    # get the count of POS/NEG/NEUT articles
    POS, NEG, NEUT = Lexicon.categorizeScores(articleScores)
    return POS, NEG, NEUT
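# A minimal usage sketch (not from the original source): the call pattern below
# mirrors engine() further down, where sentLex comes from Lexicon.sentLexicon()
# and `articles` is a plain list of article strings. The article texts here are
# placeholders.
sentLex = Lexicon.sentLexicon()
articles = ["Shares rallied after strong earnings.",
            "Regulators opened an investigation into the company."]
POS, NEG, NEUT = runLexicon(articles, sentLex)
print("Lexicon - POS: {}, NEG: {}, NEUT: {}".format(POS, NEG, NEUT))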
def getEndpoint(self):
    '''
    Return full SE endpoint and related infos
    '''
    self.lfn = self.getLFN()
    if int(self.publish_data) == 1:
        try:
            # this is not a full LFN, only the path part, add dummy filename for validating format
            Lexicon.lfn(self.lfn + "dummy.root")
        except Exception, text:
            msg = "PhEDExDataSvcInfo.py: ERROR in generated LFN :\n%s" % text
            raise CrabException(msg)
def CleanOutput_FeatureLeave(self): a = JsonClass() a.text = self.text if self.norm != self.text: a.norm = self.norm if self.pnorm: a.pnorm = self.pnorm if self.iepair: a.iepair = self.iepair if self.atom != self.text: a.atom = self.atom features = [ FeatureOntology.GetFeatureName(f) for f in Lexicon.CopyFeatureLeaves(self.features) if f not in FeatureOntology.NotShowList ] for f in features: # if isinstance(f, int): # f = "L" + str(f) setattr(a, f, '') a.StartOffset = self.StartOffset a.EndOffset = self.EndOffset if self.UpperRelationship: a.UpperRelationship = self.UpperRelationship if self.sons \ and utils.FeatureID_0 not in self.features: a.sons = [s.CleanOutput_FeatureLeave() for s in self.sons] #logging.info("in featureleave" + str(self) + "f:" + str(features)) return a
def init_static_dialog_agent(args) : print "reading in Ontology" ont = Ontology.Ontology(sys.argv[1]) print "predicates: " + str(ont.preds) print "types: " + str(ont.types) print "entries: " + str(ont.entries) print "reading in Lexicon" lex = Lexicon.Lexicon(ont, sys.argv[2]) print "surface forms: " + str(lex.surface_forms) print "categories: " + str(lex.categories) print "semantic forms: " + str(lex.semantic_forms) print "entries: " + str(lex.entries) print "instantiating Feature Extractor" f_extractor = FeatureExtractor.FeatureExtractor(ont, lex) print "instantiating Linear Learner" learner = LinearLearner.LinearLearner(ont, lex, f_extractor) print "instantiating KBGrounder" grounder = KBGrounder.KBGrounder(ont) load_parser_from_file = False if len(args) > 4 : if args[4].lower() == 'true' : load_parser_from_file = True if load_parser_from_file : parser = load_model('static_parser') grounder.parser = parser grounder.ontology = parser.ontology else : print "instantiating Parser" parser = Parser.Parser(ont, lex, learner, grounder, beam_width=10, safety=True) print "instantiating Generator" generator = Generator.Generator(ont, lex, learner, parser, beam_width=sys.maxint, safety=True) print "instantiating DialogAgent" static_policy = StaticDialogPolicy.StaticDialogPolicy() A = StaticDialogAgent(parser, generator, grounder, static_policy, None, None) if not load_parser_from_file : print "reading in training data" D = A.read_in_utterance_action_pairs(args[3]) if len(args) > 4 and args[4] == "both": print "training parser and generator jointly from actions" converged = A.jointly_train_parser_and_generator_from_utterance_action_pairs( D, epochs=10, parse_beam=30, generator_beam=10) else: print "training parser from actions" converged = A.train_parser_from_utterance_action_pairs( D, epochs=10, parse_beam=30) print "theta: "+str(parser.learner.theta) save_model(parser, 'static_parser') return A
def expandIntoListOfPhedexNodeNames(location_list):
    """
    take as input a list of locations, return a list of PNNs
    raise CrabException if input is not a valid PNN abbreviation
    use https://cmsweb.cern.ch/phedex/datasvc/doc/nodes
    """
    # build the API node filter, adding wildcards which are not required by Crab2
    args = ''
    for loc in location_list:
        phedexNode = loc.strip()
        try:
            Lexicon.cmsname(phedexNode)
        except Exception, text:
            msg = "%s\n'%s' is not a valid Phedex Node Name" % (text, phedexNode)
            raise CrabException(msg)
        args += "&node=%s*" % phedexNode
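# A minimal sketch (not from the original source) of the validation this helper
# relies on: Lexicon.cmsname() accepts a PhEDEx Node Name, or an abbreviation
# such as 'T2_US' per the docstring above, and raises for anything else. The
# site names below are only examples.
for candidate in ['T2_US_UCSD', 'T2_US', 'not-a-site']:
    try:
        Lexicon.cmsname(candidate)
        print "%s looks like a valid PhEDEx Node Name (or abbreviation)" % candidate
    except Exception, text:
        print "%s rejected: %s" % (candidate, text)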
def validateBWLists(cfg_params):
    # convert to lists for processing. But leave cfg_params
    # as strings, since this is what Crab2 code expects
    blackList = cfg_params.get("GRID.se_black_list", [])
    if type(blackList) == type("string"):
        blackList = blackList.strip().split(',')
    whiteList = cfg_params.get("GRID.se_white_list", [])
    if type(whiteList) == type("string"):
        whiteList = whiteList.strip().split(',')

    # make sure each item in the list is a valid cms node name
    # or possibly a shortcut like T3
    for site in blackList:
        try:
            Lexicon.cmsname(site)
        except Exception, text:
            msg = "ERROR in GRID.se_black_list: %s\n" % blackList
            msg += "%s\n'%s' is not a valid Phedex Node Name" % (text, site)
            raise CrabException(msg)
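# A minimal sketch (not from the original source): cfg_params only needs to
# behave like a mapping with .get(), so a plain dict is enough to exercise the
# check. The site names are examples; shortcuts like 'T1' are covered by the
# "shortcut like T3" comment above, while an invalid name raises CrabException.
fake_cfg = {"GRID.se_black_list": "T1, T3_US_Baylor",
            "GRID.se_white_list": "T2_US_UCSD"}
validateBWLists(fake_cfg)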
def init_pomdp_dialog_agent(args) : print "Reading in Ontology" ont = Ontology.Ontology(args[1]) print "predicates: " + str(ont.preds) print "types: " + str(ont.types) print "entries: " + str(ont.entries) print "Reading in Lexicon" lex = Lexicon.Lexicon(ont, args[2]) print "surface forms: " + str(lex.surface_forms) print "categories: " + str(lex.categories) print "semantic forms: " + str(lex.semantic_forms) print "entries: " + str(lex.entries) print "Instantiating Feature Extractor" f_extractor = FeatureExtractor.FeatureExtractor(ont, lex) print "Instantiating Linear Learner" learner = LinearLearner.LinearLearner(ont, lex, f_extractor) print "Instantiating KBGrounder" grounder = KBGrounder.KBGrounder(ont) load_models_from_file = False if len(args) > 4 : if args[4].lower() == 'true' : load_models_from_file = True if load_models_from_file : parser = load_model('pomdp_parser') grounder.parser = parser grounder.ontology = parser.ontology else : print "Instantiating Parser" parser = Parser.Parser(ont, lex, learner, grounder, beam_width=10) print "Instantiating DialogAgent" if load_models_from_file : agent = PomdpDialogAgent(parser, grounder, None, None, parse_depth=10, load_policy_from_file=True) else : agent = PomdpDialogAgent(parser, grounder, None, None, parse_depth=10, load_policy_from_file=False) if not load_models_from_file : print "reading in data and training parser from actions" D = agent.read_in_utterance_action_pairs(args[3]) converged = agent.train_parser_from_utterance_action_pairs(D, epochs=10, parse_beam=30) print "theta: "+str(parser.learner.theta) save_model(parser, 'pomdp_parser') #print 'Parser ontology : ', parser.ontology.preds return agent
def __init__(self, polarity_strategy=None, ego_involvement_strategy=None,
             lexicon_size=20, agent_vector_size=20, lexicon_output_location=None):
    self.uidgen = UID()  # the unique id generator for this builder
    self.polarity_strategy = polarity_strategy  # the polarity generator for this set of agents
    self.ego_involvement_strategy = ego_involvement_strategy  # the ego-involvement parameter generator for this set of agents
    self.lex = Lexicon.Lexicon(cloudsize=lexicon_size,
                               vector_size=agent_vector_size,
                               filePath=lexicon_output_location)
    self.__numAgents = 0  # the size of this group of agents
def newnode(self, start, count, compound=False): #logging.info("new node: start=" + str(start) + " count=" + str(count)) if not self.head: raise RuntimeError( "This SentenceLinkedList is null! Can't combine.") if start + count > self.size: logging.error(self.__str__()) raise RuntimeError("Can't get " + str(count) + " items start from " + str(start) + " from the sentence!") startnode = self.get(start) endnode = self.get(start + count - 1) p = startnode sons = [] EndOffset = p.StartOffset NewText = "" NewNorm = "" NewAtom = "" hasUpperRelations = [] for i in range(count): if i == 0: spaces = "" else: if compound: spaces = "_" else: spaces = " " * (p.StartOffset - EndOffset) EndOffset = p.EndOffset NewText += spaces + p.text NewNorm += spaces + p.norm NewAtom += spaces + p.atom if p.UpperRelationship and p.UpperRelationship != 'H': hasUpperRelations.append( FeatureOntology.GetFeatureID("has" + p.UpperRelationship)) sons.append(p) p = p.next NewNode = SentenceNode(NewText) NewNode.norm = NewNorm NewNode.atom = NewAtom NewNode.sons = sons NewNode.StartOffset = startnode.StartOffset NewNode.EndOffset = endnode.EndOffset Lexicon.ApplyWordLengthFeature(NewNode) for haverelation in hasUpperRelations: NewNode.ApplyFeature(haverelation) return NewNode, startnode, endnode
def generatePrefixDict():
    global Possibility, PrefixDict, total, Dictionary_URL
    fo = Lexicon.loadDict(Dictionary_URL)
    PrefixDict = set()
    FREQ = {}
    for line in fo.read().rstrip().split("\n"):
        word, freq = line.split(' ')[:2]
        FREQ[word] = float(freq)
        total += float(freq)  # accumulate the total count over all words
        for idx in range(len(word)):  # generate the prefix dictionary
            prefix = word[0:idx + 1]
            PrefixDict.add(prefix)
    fo.close()
    # Transform the frequencies into log probabilities
    Possibility = dict((key, log(value / total)) for key, value in FREQ.items())
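# A minimal sketch (not from the original source) of how the structures built
# above are typically consumed: PrefixDict lets a segmenter walk a sentence and
# keep only substrings that can still grow into a dictionary word, while
# Possibility holds the log probability of each complete word. This helper is
# hypothetical and assumes generatePrefixDict() has already been called.
def candidate_words(sentence, start):
    """Yield (end, word) pairs for dictionary words starting at `start`."""
    for end in range(start + 1, len(sentence) + 1):
        piece = sentence[start:end]
        if piece not in PrefixDict:
            break  # no dictionary word begins with this prefix
        if piece in Possibility:  # a complete word, not just a prefix
            yield end, piece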
def LexicalAnalyzeTask(SubSentence, schema):
    NodeList = Tokenization.Tokenize(SubSentence)
    if not NodeList or NodeList.size == 0:
        return None, None, None  # same arity as the successful return below
    Lexicon.ApplyLexiconToNodes(NodeList)
    # print("after ApplyLexiconToNodes" + OutputStringTokens_oneliner(NodeList))
    PrepareJSandJM(NodeList)
    #Lexicon.LexiconoQoCLookup(NodeList)

    NodeList, Dag, WinningRules = DynamicPipeline(NodeList, schema)

    # t = Thread(target=Cache.WriteSentenceDB, args=(SubSentence, NodeList))
    # t.start()
    return NodeList, Dag, WinningRules
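# A minimal usage sketch (not from the original source). The schema values
# "segonly" and "shallowcomplete" are the ones DynamicPipeline checks for; the
# sample sentence is borrowed from the debugging __main__ block further below.
NodeList, Dag, WinningRules = LexicalAnalyzeTask('科普:。,?带你看懂蜀绣冰壶比赛', "segonly")
if NodeList:
    logging.info("winning rules: {}".format(WinningRules))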
def init_dialog_agent(args): print "Reading in Ontology" ont = Ontology.Ontology(args[1]) print "predicates: " + str(ont.preds) print "types: " + str(ont.types) print "entries: " + str(ont.entries) print "Reading in Lexicon" lex = Lexicon.Lexicon(ont, args[2]) print "surface forms: " + str(lex.surface_forms) print "categories: " + str(lex.categories) print "semantic forms: " + str(lex.semantic_forms) print "entries: " + str(lex.entries) print "Instantiating Feature Extractor" f_extractor = FeatureExtractor.FeatureExtractor(ont, lex) print "Instantiating Linear Learner" learner = LinearLearner.LinearLearner(ont, lex, f_extractor) print "Instantiating KBGrounder" grounder = KBGrounder.KBGrounder(ont) print "Instantiating Parser" parser = Parser.Parser(ont, lex, learner, grounder, beam_width=10) parser = load_model('parser') grounder.parser = parser grounder.ontology = parser.ontology print "Instantiating DialogAgent" agent = PomdpDialogAgent(parser, grounder, None, None) #print "reading in data and training parser from actions" #D = agent.read_in_utterance_action_pairs(args[3]) #converged = agent.train_parser_from_utterance_action_pairs(D, epochs=10, parse_beam=30) #print "theta: "+str(parser.learner.theta) #save_model(parser, 'parser') #print 'Parser ontology : ', parser.ontology.preds return agent
def __init__(self, args): print 'args = ', args, '\n\n\n\n' if len(args) < 4: print 'Usage ', args[ 0], ' ont_file lex_file parser_train_pairs_file [load_models_from_file=true/false]' rospy.init_node('dialog_agent_aishwarya') self.user_log = open(MAIN_LOG_PATH + USER_LOG_FILE, 'a') self.error_log = open(MAIN_LOG_PATH + MAIN_ERROR_LOG_FILE, 'a') self.started_users = set() print "reading in Ontology" ont = Ontology.Ontology(args[1]) print "predicates: " + str(ont.preds) print "types: " + str(ont.types) print "entries: " + str(ont.entries) self.ont = ont print "reading in Lexicon" lex = Lexicon.Lexicon(ont, args[2]) print "surface forms: " + str(lex.surface_forms) print "categories: " + str(lex.categories) print "semantic forms: " + str(lex.semantic_forms) print "entries: " + str(lex.entries) self.lex = lex self.parser_train_file = args[3] self.load_models_from_file = False if len(args) > 4: if args[4].lower() == 'true': print 'Going to load from file' # DEBUG self.load_models_from_file = True self.lock = Lock() self.service = rospy.Service('register_user', register_user, self.on_user_receipt)
import sys sys.path.append('.') # necessary to import local libraries import Ontology import Lexicon import CKYParser print "reading in Ontology" ont = Ontology.Ontology(sys.argv[1]) print "predicates: " + str(ont.preds) print "types: " + str(ont.types) print "entries: " + str(ont.entries) print "reading in Lexicon" lex = Lexicon.Lexicon(ont, sys.argv[2], word_embeddings_fn=sys.argv[5]) print "surface forms: " + str(lex.surface_forms) print "categories: " + str(lex.categories) print "semantic forms: " + str(lex.semantic_forms) print "entries: " + str(lex.entries) print "instantiating CKYParser" parser = CKYParser.CKYParser(ont, lex, lexicon_weight=1) parser.allow_merge = False parser.max_multiword_expression = 1 parser.max_missing_words_to_try = 0 print "reading in data and beginning training test" d = parser.read_in_paired_utterance_semantics(sys.argv[3]) converged = parser.train_learner_on_semantic_forms(d, 20, reranker_beam=10) if not converged:
class EvolGUI(): def __init__(self, lexicon): self.lexicon = lexicon # each `root` object is a window # open one for each figure and all the buttons/sliders self.root = tk.Tk() self.root.title("Segmental Information by position") self.fig1 = Figure() self.canvas = FigureCanvasTkAgg(self.fig1, master=self.root) self.canvas.get_tk_widget().grid(row=0, column=0) self.root2 = tk.Tk() self.root2.title("Word Length Distribution") self.fig2 = Figure() self.canvas2 = FigureCanvasTkAgg(self.fig2, master=self.root2) self.canvas2.get_tk_widget().grid(row=0, column=0) self.root3 = tk.Tk() self.root3.title("Word Length and -log Word Probability") self.fig3 = Figure() self.canvas3 = FigureCanvasTkAgg(self.fig3, master=self.root3) self.canvas3.get_tk_widget().grid(row=0, column=0) self.root4 = tk.Tk() self.root4.title("Lexical Phoneme distribution") self.fig4 = Figure() self.canvas4 = FigureCanvasTkAgg(self.fig4, master=self.root4) self.canvas4.get_tk_widget().grid(row=0, column=0) self.root5 = tk.Tk() self.root5.title("Positional Entropy") self.fig5 = Figure() self.canvas5 = FigureCanvasTkAgg(self.fig5, master=self.root5) self.canvas5.get_tk_widget().grid(row=0, column=0) self.interaction_root = tk.Tk() self.interaction_root.title("Lexical Evolution GUI") button_frame = tk.Frame(self.interaction_root) button_frame.grid(row=0, column=0) self.evolution_steps = 0 tk.Label(button_frame, text='evolution steps:').grid(row=0, column=0) self.evolution_steps_label = tk.Label(button_frame, text=self.evolution_steps) self.evolution_steps_label.grid(row=0, column=1) tk.Label(button_frame, text='lexicon size').grid(row=1, column=0) tk.Label(button_frame, text='n. symbols').grid(row=1, column=1) self.last_lexicon_size = len(self.lexicon) self.last_n_symbols = len(self.lexicon.phones) self.lexicon_size_text = tk.Entry(button_frame, width=6) self.lexicon_size_text.grid(row=2, column=0) self.lexicon_size_text.insert(0, str(self.last_lexicon_size)) self.n_symbols_text = tk.Entry(button_frame, width=4) self.n_symbols_text.grid(row=2, column=1) self.n_symbols_text.insert(0, str(self.last_n_symbols)) # figure out minimum length needed for whole lexicon total_possible_forms = 0 for i in range(1, self.lexicon.hard_max_length): total_possible_forms += self.n_symbols()**i if total_possible_forms > len(self.lexicon): break self.min_len_needed = tk.Label( button_frame, text='max length needed for lexicon: {0}'.format(i)) self.min_len_needed.grid(row=3, column=0) self.possible_forms = tk.Label( button_frame, text='possible forms at length: {0}'.format(total_possible_forms)) self.possible_forms.grid(row=3, column=1) tk.Label(button_frame, text='starting word length (-1 for Zipfian)').grid(row=4, column=0) self.last_hard_word_length = 6 self.hard_word_length_text = tk.Entry(button_frame, width=4) self.hard_word_length_text.grid(row=4, column=1) self.hard_word_length_text.insert(0, str(self.last_hard_word_length)) slider_frame = tk.Frame(self.interaction_root) slider_frame.grid(row=0, column=1) tk.Label(slider_frame, text='lexicon name').grid(row=0, column=1) self.last_lex_name = '' self.lex_name_text = tk.Entry(slider_frame, width=6) self.lex_name_text.grid(row=1, column=1) self.lex_name_text.insert(0, str(self.last_lex_name)) self.merger_p_slider = tk.Scale(slider_frame, from_=0, to=100, orient=tk.HORIZONTAL, label='merger prob.') self.merger_p_slider.grid(row=2, column=1) self.merger_p_slider.set(85) # determine skew in distribution for merger tk.Label(slider_frame, text='phone. dist. 
E').grid(row=3, column=1) self.last_symbol_E = 1.5 self.symbol_E_text = tk.Entry(slider_frame, width=4) self.symbol_E_text.grid(row=4, column=1) self.symbol_E_text.insert(0, str(self.last_symbol_E)) # skew in distribution for chosing word to undergo merger/deletion tk.Label(slider_frame, text='word E').grid(row=3, column=0) self.last_word_E = 1. self.word_E_text = tk.Entry(slider_frame, width=4) self.word_E_text.grid(row=4, column=0) self.word_E_text.insert(0, str(self.last_word_E)) # skew in distribution for chosing segment in word which will undergo merger/deletion tk.Label(slider_frame, text='segment E').grid(row=3, column=2) self.last_segment_E = 1.5 self.segment_E_text = tk.Entry(slider_frame, width=4) self.segment_E_text.grid(row=4, column=2) self.segment_E_text.insert(0, str(self.last_segment_E)) nb_steps = 1000 # prepare the buttons tk.Button(button_frame, text="One Step", command=self.step).grid(row=5, column=0) tk.Button(button_frame, text="{0} Steps".format(nb_steps), command=lambda: self.step(nb_steps)).grid(row=5, column=1) tk.Button(button_frame, text="Reset Lexicon", command=self.reset_lex).grid(row=6, column=0) tk.Button(button_frame, text="Quit", command=sys.exit).grid(row=6, column=1) tk.Button(button_frame, text='Save Lexicon', command=lambda: self.lexicon.save('lex_{0}_{1}.txt'.format( self.lex_name_text.get(), self.evolution_steps))).grid( row=7, column=0) tk.Button(button_frame, text='Save Plots', command=self.save_plots).grid(row=7, column=1) # prepare the line graph self.plot_1 = self.fig1.subplots() max_si = 0 self.avg_si_lines = [] if self.lexicon.frequency_groups == 2: line_labels = ['high frequency', 'low frequency'] else: line_labels = [ 'group {0}'.format(i + 1) for i in range(self.lexicon.frequency_groups) ] for i in range(self.lexicon.frequency_groups): x = np.arange(self.lexicon.hard_max_length) + 1 avg_si = self.lexicon.avg_segmental_info(which_group=i + 1) new_line, = self.plot_1.plot(x, avg_si, color=_colors[i], label=line_labels[i]) self.avg_si_lines.append(new_line) max_si = max(max_si, max(avg_si)) self.plot_1.set_xlim(1, self.lexicon.hard_max_length) for y_lim in range(5, 50, 5): if y_lim > max_si: break self.plot_1.set_ylim(-.5, y_lim) self.plot_1.legend(handles=self.avg_si_lines) self.plot_1.set_xlabel('seg. position') self.plot_1.set_ylabel('mean seg. info.') #self.plot_1.set_title('avg. seg info') # prep the word length histogram self.plot_2 = self.fig2.subplots() hist_data = [ self.lexicon.word_lengths(i + 1) for i in range(self.lexicon.frequency_groups) ] self.wl_hist = self.plot_2.hist( hist_data, range=(1, self.lexicon.hard_max_length), stacked=False, color=_colors[:self.lexicon.frequency_groups]) self.plot_2.set_xlabel('word length') self.plot_2.set_ylabel('count') # zipf! 
sorted_unig = sorted([w.unigram for w in self.lexicon.words]) self.plot_3 = self.fig3.subplots() self.plot_3.set_xlim(0, self.lexicon.hard_max_length) self.plot_3.set_ylim(sorted_unig[0] - 1, sorted_unig[-1] + 1) self.plot_3.set_xlabel('word length') self.plot_3.set_ylabel('-log word prob.') #self.plot_3.set_title('word length and unigram word information') lengths, unigrams = zip(*self.lexicon.lengths_and_unigrams()) slope, intercept, r_value, p_value, std_err = stats.linregress( lengths, unigrams) unig_pred = intercept + (slope * np.arange(self.lexicon.hard_max_length)) self.zipf_scatter, = self.plot_3.plot(lengths, unigrams, 'o') self.zipf_line, = self.plot_3.plot( np.arange(self.lexicon.hard_max_length), unig_pred) # phoneme distribution """ self.plot_4 = self.fig4.subplots(2) ks, ps = p_dist_to_lists(self.lexicon.seg_ps, sort_by_keys = True) self.phoneme_dist_bars = self.plot_4[0].bar(np.arange(len(ks)), ps, color = _colors[-3]) self.plot_4[0].set_ylim(0,.75 if max(ps) < .75 else 1) self.plot_4[0].set_xticks(np.arange(len(ks))) self.plot_4[0].set_xticklabels(ks) self.plot_4[0].set_title('seg. distribution in lexicon') # segmental entropy self.edge_ent_bars = [] for i, edge_ent in enumerate(self.lexicon.edge_entropies()): new_bar = self.plot_4[1].bar((i * .5) + (1.5 * np.arange(self.lexicon.frequency_groups)), edge_ent, color = _colors[-2+i], width = .5) self.edge_ent_bars.append(new_bar) self.plot_4[1].set_xticks(.25 + np.arange(self.lexicon.frequency_groups) * 1.5) if self.lexicon.frequency_groups == 2: self.plot_4[1].set_xticklabels(['high frequency', 'low frequency']) else: self.plot_4[1].set_xticklabels(['group {0}'.format(i + 1) for i in range(self.lexicon.frequency_groups)]) self.plot_4[1].set_title('seg. entropy - first/last segment') self.plot_4[1].legend(labels = ['first', 'last']) """ self.plot_4 = self.fig4.subplots() ks, ps = p_dist_to_lists(self.lexicon.seg_ps, sort_by_keys=True) self.phoneme_dist_bars = self.plot_4.bar(np.arange(len(ks)), ps, color=_colors[-3]) self.plot_4.set_ylim(0, .75 if max(ps) < .75 else 1) self.plot_4.set_xticks(np.arange(len(ks))) self.plot_4.set_xticklabels(ks) self.plot_4.set_title('seg. distribution in lexicon') # positional entropy self.plot_5 = self.fig5.subplots() self.pos_ent_lines = [] max_pe = 0 # could use this from above but this makes it easier to read... 
if self.lexicon.frequency_groups == 2: line_labels = ['high frequency', 'low frequency'] else: line_labels = [ 'group {0}'.format(i + 1) for i in range(self.lexicon.frequency_groups) ] for i in range(self.lexicon.frequency_groups): x = np.arange(self.lexicon.hard_max_length) + 1 pos_ent = self.lexicon.positional_entropy(which_group=i + 1) new_line, = self.plot_5.plot(x, pos_ent, color=_colors[i], label=line_labels[i]) self.pos_ent_lines.append(new_line) max_pe = max(max_pe, max(pos_ent)) self.plot_5.set_xlim(1, self.lexicon.hard_max_length) for y_lim in range(5, 30, 3): if y_lim > max_pe: break self.plot_5.set_ylim(-.5, y_lim) self.plot_5.legend(handles=self.pos_ent_lines) tk.mainloop() def save_plots(self): self.fig1.savefig('plts/plt_1_iter_{0}.png'.format( self.evolution_steps)) self.fig2.savefig('plts/plt_2_iter_{0}.png'.format( self.evolution_steps)) self.fig3.savefig('plts/plt_3_iter_{0}.png'.format( self.evolution_steps)) self.fig4.savefig('plts/plt_4_iter_{0}.png'.format( self.evolution_steps)) self.fig5.savefig('plts/plt_5_iter_{0}.png'.format( self.evolution_steps)) def update(self): # put all the updating in try/catch in case someone closes a window try: # update the plot with new data max_si = 0 for i, line in enumerate(self.avg_si_lines): avg_si = self.lexicon.avg_segmental_info(which_group=i + 1) line.set_ydata(avg_si) max_si = max(max_si, max(avg_si)) for y_lim in range(5, 50, 5): if y_lim > max_si: break self.plot_1.set_ylim(-.5, y_lim) self.canvas.draw() except: pass #print('unable to update plot 1') # word length histogram try: self.plot_2.cla() hist_data = [ self.lexicon.word_lengths(i + 1) for i in range(self.lexicon.frequency_groups) ] self.wl_hist = self.plot_2.hist( hist_data, range=(1, self.lexicon.hard_max_length), stacked=False, color=_colors[:self.lexicon.frequency_groups]) self.plot_2.set_title('word lengths') self.canvas2.draw() except: pass #print('unable to update plot 2') try: # zipf scatter lengths, unigrams = zip(*self.lexicon.lengths_and_unigrams()) slope, intercept, r_value, p_value, std_err = stats.linregress( lengths, unigrams) unig_pred = intercept + (slope * np.arange(self.lexicon.hard_max_length)) sorted_unig = sorted([w.unigram for w in self.lexicon.words]) self.plot_3.set_ylim(sorted_unig[0] - 1, sorted_unig[-1] + 1) self.zipf_scatter.set_xdata(lengths) self.zipf_line.set_ydata(unig_pred) self.canvas3.draw() except: pass #print('unable to update plot 3') try: """ ks, ps = p_dist_to_lists(self.lexicon.seg_ps, sort_by_keys = True) self.plot_4[0].cla() self.phoneme_dist_bars = self.plot_4[0].bar(np.arange(len(ks)), ps, color = _colors[-3]) self.plot_4[0].set_ylim(0,.75 if max(ps) < .75 else 1) self.plot_4[0].set_xticks(np.arange(len(ks))) self.plot_4[0].set_xticklabels(ks) self.plot_4[0].set_title('seg. distribution in lexicon') # first/last seg info self.plot_4[1].cla() for i, edge_ent in enumerate(self.lexicon.edge_entropies()): new_bar = self.plot_4[1].bar((i * .5) + (1.5 * np.arange(self.lexicon.frequency_groups)), edge_ent, color = _colors[-2+i], width = .5) self.edge_ent_bars.append(new_bar) self.plot_4[1].set_xticks(.25 + np.arange(self.lexicon.frequency_groups) * 1.5) if self.lexicon.frequency_groups == 2: self.plot_4[1].set_xticklabels(['high frequency', 'low frequency']) else: self.plot_4[1].set_xticklabels(['group {0}'.format(i + 1) for i in range(self.lexicon.frequency_groups)]) self.plot_4[1].set_title('avg. 
information - first/last segment') self.plot_4[1].legend(labels = ['first', 'last']) self.canvas4.draw() """ ks, ps = p_dist_to_lists(self.lexicon.seg_ps, sort_by_keys=True) self.plot_4.cla() self.phoneme_dist_bars = self.plot_4.bar(np.arange(len(ks)), ps, color=_colors[-3]) self.plot_4.set_ylim(0, .75 if max(ps) < .75 else 1) self.plot_4.set_xticks(np.arange(len(ks))) self.plot_4.set_xticklabels(ks) self.plot_4.set_title('seg. distribution in lexicon') self.canvas4.draw() except: pass #print('unable to update plot 4') try: max_pe = 0 for i, line in enumerate(self.pos_ent_lines): pos_ent = self.lexicon.positional_entropy(which_group=i + 1) line.set_ydata(pos_ent) max_si = max(max_pe, max(pos_ent)) for y_lim in range(5, 30, 3): if y_lim > max_pe: break self.plot_5.set_ylim(-.5, y_lim) self.canvas5.draw() except: pass #print('unable to update plot 5') def step(self, n_steps=1): if self.evolution_steps == 0: self.lexicon.save('lex_{0}_{1}.txt'.format( self.lex_name_text.get(), self.evolution_steps)) for i in range(n_steps): self.evolution_steps += 1 self.evolution_steps_label['text'] = self.evolution_steps self.lexicon.change_segs(word_E=self.word_E(), symbol_E=self.symbol_E(), merger_p=self.merger_p()) print('step: {0} - total steps: {1}'.format( i + 1, self.evolution_steps)) if self.evolution_steps % 25 == 0: self.update() self.lexicon.save('lex_{0}_{1}.txt'.format( self.lex_name_text.get(), self.evolution_steps)) for i, w in enumerate(self.lexicon.words[:10]): print(i, w, w.frequency) self.update() self.lexicon.save('lex_{0}_{1}.txt'.format(self.lex_name_text.get(), self.evolution_steps)) def reset_lex(self): self.evolution_steps = 0 self.evolution_steps_label['text'] = self.evolution_steps self.lexicon = Lexicon(self.lexicon_size(), phones=self.n_symbols(), frequency_groups=self.lexicon.frequency_groups, hard_max_length=self.lexicon.hard_max_length, hard_start_length=self.hard_word_length()) # figure out minimum length needed for whole lexicon total_possible_forms = 0 for i in range(1, self.lexicon.hard_max_length): total_possible_forms += self.n_symbols()**i if total_possible_forms > len(self.lexicon): break self.min_len_needed[ 'text'] = 'max length needed for lexicon: {0}'.format(i) self.possible_forms['text'] = 'possible forms at length: {0}'.format( total_possible_forms) # zipf self.plot_3.cla() sorted_unig = sorted([w.unigram for w in self.lexicon.words]) self.plot_3.set_xlim(0, self.lexicon.hard_max_length) self.plot_3.set_ylim(sorted_unig[0] - 1, sorted_unig[-1] + 1) self.plot_3.set_title('word length and unigram word information') lengths, unigrams = zip(*self.lexicon.lengths_and_unigrams()) slope, intercept, r_value, p_value, std_err = stats.linregress( lengths, unigrams) unig_pred = intercept + (slope * np.arange(self.lexicon.hard_max_length)) self.zipf_scatter, = self.plot_3.plot(lengths, unigrams, 'o') self.zipf_line, = self.plot_3.plot( np.arange(self.lexicon.hard_max_length), unig_pred) self.update() def merger_p(self): return self.merger_p_slider.get() / 100 def symbol_E(self): try: symbol_E = float(self.symbol_E_text.get()) except: symbol_E = self.last_symbol_E self.last_symbol_E = symbol_E return symbol_E def word_E(self): try: word_E = int(self.word_E_text.get()) except: word_E = self.last_word_E self.last_word_E = word_E return word_E def lexicon_size(self): try: lexicon_size = int(self.lexicon_size_text.get()) except: symbol_E = self.last_lexicon_size self.last_lexicon_size = lexicon_size return lexicon_size def n_symbols(self): try: n_symbols = 
int(self.n_symbols_text.get()) except: n_symbols = self.last_n_symbols self.last_n_symbols = n_symbols return n_symbols def hard_word_length(self): try: hard_word_length = int(self.hard_word_length_text.get()) except: hard_word_length = self.last_hard_word_length self.last_hard_word_length = hard_word_length if hard_word_length < 0: hard_word_length = None return hard_word_length
import sys sys.path.append('.') # necessary to import local libraries import Ontology import Lexicon import CKYParser print "reading in Ontology" ont = Ontology.Ontology(sys.argv[1]) commutative_idxs = [ont.preds.index('and'), ont.preds.index('or')] print "predicates: " + str(ont.preds) print "types: " + str(ont.types) print "entries: " + str(ont.entries) print "reading in Lexicon" lex = Lexicon.Lexicon(ont, sys.argv[2]) print "surface forms: " + str(lex.surface_forms) print "categories: " + str(lex.categories) print "semantic forms: " + str(lex.semantic_forms) print "entries: " + str(lex.entries) print "instantiating CKYParser" parser = CKYParser.CKYParser(ont, lex, use_language_model=True) print "reading in data and beginning training test" d = parser.read_in_paired_utterance_semantics(sys.argv[3]) converged = parser.train_learner_on_semantic_forms(d, 10, reranker_beam=10) if not converged: raise AssertionError("Training failed to converge to correct values.") print "reading in data and beginning evaluation test"
def LoadCommon(): if not utils.DisableDB: InitDB() import Cache Cache.LoadSentenceDB() PipeLineLocation = ParserConfig.get("main", "Pipelinefile") FILE_ABS_PATH = os.path.dirname(os.path.abspath(__file__)) XLocation = FILE_ABS_PATH + '/' + os.path.dirname(PipeLineLocation) + "/" #XLocation = os.path.dirname(PipeLineLocation) + "/" FeaturefileLocation = os.path.join(XLocation, "../Y/feature.txt") GlobalmacroLocation = os.path.join(XLocation, "../Y/GlobalMacro.txt") # PunctuatefileLocation = os.path.join(XLocation, "../Y/LexY-EnglishPunctuate.txt") FeatureOntology.LoadFeatureOntology(FeaturefileLocation) systemfileolderthanDB = SystemFileOlderThanDB(XLocation) LoadPipeline(PipeLineLocation) if logging.root.isEnabledFor(logging.DEBUG): logging.debug("Runtype:" + ParserConfig.get("main", "runtype")) if logging.root.isEnabledFor(logging.DEBUG): logging.debug("utils.Runtype:" + utils.ParserConfig.get("main", "runtype")) Rules.LoadGlobalMacro(GlobalmacroLocation) # Lexicon.LoadLexicon(PunctuatefileLocation) for action in PipeLine: if action.startswith("FSA "): Rulefile = action[3:].strip() Rules.LoadRules(XLocation, Rulefile,systemfileolderthanDB) if action.startswith("DAGFSA "): Rulefile = action[6:].strip() Rules.LoadRules(XLocation, Rulefile,systemfileolderthanDB) if action.startswith("DAGFSA_APP "): #FUZZY Rulefile = action[10:].strip() Rules.LoadRules(XLocation, Rulefile,systemfileolderthanDB, fuzzy=True) if action.startswith("Lookup Spelling:"): Spellfile = action[action.index(":")+1:].strip().split(",") for spell in Spellfile: spell = spell.strip() if spell: Lexicon.LoadExtraReference(XLocation + spell, Lexicon._LexiconCuobieziDict) if action.startswith("Lookup Encoding:"): Encodefile = action[action.index(":")+1:].strip().split(",") for encode in Encodefile: encode = encode.strip() if encode: Lexicon.LoadExtraReference(XLocation + encode, Lexicon._LexiconFantiDict) if action.startswith("Lookup Main:"): Mainfile = action[action.index(":")+1:].strip().split(",") for main in Mainfile: main = main.strip() if main: Lexicon.LoadMainLexicon(XLocation + main) if action.startswith("Lookup SegmentSlash:"): Slashfile = action[action.index(":")+1:].strip().split(",") for slash in Slashfile: slash = slash.strip() if slash: Lexicon.LoadSegmentSlash(XLocation + slash) if action.startswith("Lookup Lex:"): Lexfile = action[action.index(":")+1:].strip().split(",") for lex in Lexfile: lex = lex.strip() if lex: Lexicon.LoadLexicon(XLocation + lex) # (O.O) if action.startswith("Stemming:"): Stemfile = action[action.index(":") + 1:].strip().split(",") inf = Stemfile[0].strip() Rules.LoadRules(XLocation, inf, systemfileolderthanDB) Lexicon.LoadSuffix(XLocation + inf, inf) for stem in Stemfile[1:]: stem = stem.strip() if stem: Lexicon.LoadLexicon(XLocation + stem, lookupSource=LexiconLookupSource.stemming) if action.startswith("Lookup Compound:"): Compoundfile = action[action.index(":")+1:].strip().split(",") for compound in Compoundfile: compound = compound.strip() if compound: Lexicon.LoadLexicon(XLocation + compound, lookupSource=LexiconLookupSource.Compound) if action.startswith("Lookup defLex:"): Compoundfile = action[action.index(":")+1:].strip().split(",") for compound in Compoundfile: compound = compound.strip() if compound: Lexicon.LoadLexicon(XLocation + compound, lookupSource=LexiconLookupSource.defLex) if action.startswith("Lookup External:"): Externalfile = action[action.index(":")+1:].strip().split(",") for external in Externalfile: external = external.strip() if external: 
Lexicon.LoadLexicon(XLocation + external,lookupSource=LexiconLookupSource.External) if action.startswith("Lookup oQcQ:"): oQoCfile = action[action.index(":")+1:].strip().split(",") for oQoC in oQoCfile: oQoC = oQoC.strip() if oQoC: Lexicon.LoadLexicon(XLocation + oQoC,lookupSource=LexiconLookupSource.oQcQ) if action.startswith("Lookup IE:"): compositefile = action[action.index(":")+1:].strip().split(",") for composite in compositefile: comp = composite.strip() if comp: Lexicon.LoadCompositeKG(XLocation + comp) Lexicon.LoadSegmentLexicon() UpdateSystemFileFromDB(XLocation) if not utils.DisableDB: CloseDB(utils.DBCon) if ParserConfig.get("main", "runtype") == "Debug": logging.debug("Start writing temporary rule files") Rules.OutputRuleFiles(ParserConfig.get("main", "compiledfolder")) FeatureOntology.OutputFeatureOntologyFile(ParserConfig.get("main", "compiledfolder")) logging.debug("Start writing temporary lex file.") #Lexicon.OutputLexiconFile(ParserConfig.get("main", "compiledfolder")) #Rules._PreProcess_RuleIDNormalize() logging.debug("Done of LoadCommon!")
            except KeyError:
                index = self.labels_dict['O']
            labels_list.append(index)
        return labels_list

    def show_data_info(self):
        """
        :return: None
        Print information about this Data object, including the maximum sentence
        length, the maximum number of words in a single sentence, and the number
        of sentences in the input file.
        """
        print('Data info:')
        print('max sentence length:', self.properties['max_sentence_length'])
        print('max words in a single sentence:', self.properties['max_words_number'])
        print('number of sentences:', self.properties['sentence_number'])


if __name__ == '__main__':
    tic = time.time()
    print('Testing the Data class...')
    lex = Lexicon.Lexicon()
    print(1)
    path = r'NERData\MSRA\msra_train_bio.txt'
    data = Data(path, lex)
    data.show_data_info()
    print(data.chars_dict, data.labels_dict, sep='\n')
    for i in data.data[0]:
        print(i)
    toc = time.time()
    print('elapsed time:', toc - tic)
def main(): # Load parameters from command line. ontology_fn = FLAGS_ontology_fn lexicon_fn = FLAGS_lexicon_fn train_pairs_fn = FLAGS_train_pairs_fn model_fn = FLAGS_model_fn validation_pairs_fn = FLAGS_validation_pairs_fn lexicon_embeddings = FLAGS_lexicon_embeddings max_epochs = FLAGS_max_epochs epochs_between_validations = FLAGS_epochs_between_validations lexicon_weight = FLAGS_lexicon_weight allow_merge = True if FLAGS_allow_merge == 1 else False perform_type_raising = True if FLAGS_perform_type_raising == 1 else False verbose = FLAGS_verbose use_condor = True if FLAGS_use_condor == 1 else False condor_target_dir = FLAGS_condor_target_dir condor_script_dir = FLAGS_condor_script_dir assert validation_pairs_fn is None or max_epochs >= epochs_between_validations assert not use_condor or (condor_target_dir is not None and condor_script_dir is not None) assert max_epochs >= 0 or train_pairs_fn is not None o = Ontology.Ontology(ontology_fn) l = Lexicon.Lexicon( o, lexicon_fn, word_embeddings_fn=lexicon_embeddings, ) p = CKYParser.CKYParser(o, l, allow_merge=allow_merge, lexicon_weight=lexicon_weight, perform_type_raising=perform_type_raising) # hyperparameter adjustments p.max_multiword_expression = 1 p.max_missing_words_to_try = 0 # basically disallows polysemy that isn't already present in lexicon # Train the parser one epoch at a time, examining validation performance between each epoch. if max_epochs > 0: train_data = p.read_in_paired_utterance_semantics(train_pairs_fn) val_data = p.read_in_paired_utterance_semantics(validation_pairs_fn) \ if validation_pairs_fn is not None else None print "finished instantiating parser; beginning training" for epoch in range(0, max_epochs, epochs_between_validations): if val_data is not None: acc_at_1 = get_performance_on_pairs(p, val_data) print "validation accuracy at 1 for epoch " + str( epoch) + ": " + str(acc_at_1) converged = p.train_learner_on_semantic_forms( train_data, epochs=epochs_between_validations, epoch_offset=epoch, reranker_beam=1, verbose=verbose, use_condor=use_condor, condor_target_dir=condor_target_dir, condor_script_dir=condor_script_dir) if converged: print "training converged after epoch " + str(epoch) break if val_data is not None: acc_at_1 = get_performance_on_pairs(p, val_data) print "validation accuracy at 1 at training stop: " + str(acc_at_1) # Write the parser to file. print "writing trained parser to file..." with open(model_fn, 'wb') as f: pickle.dump(p, f) print "... done"
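# A minimal sketch (not from the original source) of reading back the parser
# that main() pickles to model_fn above; the filename here is a placeholder.
import pickle

with open('trained_parser.pkl', 'rb') as f:
    trained_parser = pickle.load(f)
print "reloaded parser; ontology predicates: " + str(trained_parser.ontology.preds)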
from Lexicon import *
from GUI import *

if __name__ == '__main__':
    print('\n\n\n')
    n_words = 1000
    n_phones = 10
    #phones = {'a' : 10, 'b' : 5, 'c' : 5, 'd' : 1}
    l = Lexicon(n_words,
                phones=n_phones,
                frequency_groups=2,
                hard_start_length=6)
    EvolGUI(l)
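# A minimal headless sketch (not from the original source): the same Lexicon can
# be evolved without the GUI by calling change_segs() and save(), the two methods
# EvolGUI.step() uses above. The parameter values below are placeholders.
lex = Lexicon(1000, phones=10, frequency_groups=2, hard_start_length=6)
for step in range(100):
    lex.change_segs(word_E=1.0, symbol_E=1.5, merger_p=0.85)
lex.save('lex_headless_100.txt')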
def DynamicPipeline(NodeList, schema): WinningRules = {} Dag = DependencyTree.DependencyTree() for action in PipeLine: if action == "segmentation": continue if action == "apply lexicons": continue if action == "SEGMENTATION COMPLETE" and schema == "segonly": break if action == "SHALLOW COMPLETE" and schema == "shallowcomplete": break #applies caseab, caseAb, caseaB, or caseAB if action == "CASES": Lexicon.ApplyCasesToNodes(NodeList) if action.startswith("FSA "): Rulefile = action[3:].strip() WinningRules.update(MatchAndApplyRuleFile(NodeList, Rulefile)) # if NodeList: # logging.debug(NodeList.root(True).CleanOutput(KeepOriginFeature=True).toJSON()) # if action.startswith("lookup"): # lookupSourceName = action[6:].strip() # for x in LexiconLookupSource: # if x.name == lookupSourceName: # Lexicon.LexiconLookup(NodeList, x) # # if action == "APPLY COMPOSITE KG": # Lexicon.ApplyCompositeKG(NodeList) if action.startswith("Lookup defLex:") or action.startswith("Lookup External:") \ or action.startswith("Lookup oQcQ") or action.startswith("Lookup Compound:"): lookupSourceName = action[6:action.index(":")].strip() for x in LexiconLookupSource: if x.name == lookupSourceName: Lexicon.LexiconLookup(NodeList, x) if action.startswith("Lookup IE"): Lexicon.ApplyCompositeKG(NodeList) # # if action == "TRANSFORM DAG": # Dag.transform(NodeList) # logging.info("Dag:{}".format(Dag)) if action.startswith("DAGFSA "): if len(Dag.nodes) == 0: try: Dag.transform(NodeList) except Exception as e: logging.error("Failed to transfer the NodeList to Dag due to:\n{}".format(e)) return NodeList, Dag, WinningRules Rulefile = action[7:].strip() WinningRules.update(MatchAndApplyDagRuleFile(Dag, Rulefile)) if action.startswith("DAGFSA_APP "): if len(Dag.nodes) == 0: try: Dag.transform(NodeList) except Exception as e: logging.error("Failed to transfer the NodeList to Dag due to:\n{}".format(e)) return NodeList, Dag, WinningRules Rulefile = action[10:].strip() WinningRules.update(MatchAndApplyDagRuleFile(Dag, Rulefile)) return NodeList, Dag, WinningRules
for suffix in ['DBSReader', 'DBSWriter']:
    if url.endswith(suffix):
        url = url[0:-len(suffix)]
readUrl = url + 'DBSReader'
writeUrl = url + 'DBSWriter'

readApi = DbsApi(url=readUrl)
writeApi = DbsApi(url=writeUrl)

dataset = options.dataset
if options.new_location:
    new_location = options.new_location

### sanitize input
# dataset name
Lexicon.dataset(dataset)
# PNN
if new_location:
    Lexicon.cmsname(new_location)

# process dataset by blocks
blockDicts = readApi.listBlocks(dataset=dataset, detail=True)
for block in blockDicts:
    blName = block['block_name']
    location = block['origin_site_name']
    logging.debug('block %s at location: %s' % (blName, location))
    if new_location:
        writeApi.updateBlockSiteName(block_name=blName, origin_site_name=new_location)
        logging.debug('location set to %s' % (new_location))
def transform(self, nodelist): #Transform from SentenceLinkedList to Depen if logging.root.isEnabledFor(logging.DEBUG): logging.debug("Start to transform:\n {}".format( jsonpickle.dumps(nodelist))) self.fulltext = nodelist.root().text self.fullnorm = nodelist.root().norm self.fullatom = nodelist.root().atom root = nodelist.head if root.text == '' and utils.FeatureID_JS in root.features: root = root.next #ignore the first empty (virtual) JS node temp_subgraphs = [] # Collect all the leaf nodes into self.nodes. while root is not None: #each "root" has a tree, independent from others. node = root nodestack = set() while node: if node.sons: if len(node.sons) == 2 and len(node.text) == 2 and len( node.sons[0].text) == 1 and len( node.sons[1].text) == 1: DanziDict.update({node: node.sons}) if node.next: nodestack.add(node.next) node = node.sons[0] else: if not (node.text == '' and utils.FeatureID_JM in node.features): self.nodes.update({node.ID: copy.deepcopy(node) }) # add leaf node to self.nodes. if node == root: #if node is in root level, don't get next. if nodestack: node = nodestack.pop() else: node = None continue node = node.next if node is None and nodestack: node = nodestack.pop() if not (root.text == '' and utils.FeatureID_JM in root.features): temp_subgraphs.append(SubGraph(root)) self._roots.append(root.ID) root = root.next #filling up the subgraphs. while temp_subgraphs: subgraph = temp_subgraphs.pop() node = subgraph.startnode if node.sons: subnode = node.sons[0] nodestack = set() while subnode: if subnode.sons: if utils.FeatureID_H not in subnode.features: temp_subgraphs.append(SubGraph( subnode)) # non-leaf, non-H. it is a subgraph. subgraph.leaves.append( [subnode.ID, subnode.UpperRelationship]) subnode = subnode.next if subnode is None and nodestack: subnode = nodestack.pop() else: if subnode.next: nodestack.add(subnode.next) subnode = subnode.sons[0] else: # this is a leaf node. # use the copy in self.nodes to apply feature modification if utils.FeatureID_H in subnode.features: subgraph.headID = subnode.ID self.nodes[subnode.ID].features.update( subgraph.startnode.features) Lexicon.ApplyWordLengthFeature( self.nodes[subnode.ID]) else: if not (subnode.text == '' and utils.FeatureID_JM in subnode.features): subgraph.leaves.append( [subnode.ID, subnode.UpperRelationship]) subnode = subnode.next if subnode is None and nodestack: subnode = nodestack.pop() else: subgraph.headID = subgraph.startnode.ID self._subgraphs.append(subgraph) # add to the permanent subgraphs # now set the roots, from the top node to the head. for i in range(len(self._roots)): if self._roots[i] not in self.nodes: for _subgraph in self._subgraphs: if _subgraph.startnode.ID == self._roots[i]: self._roots[i] = _subgraph.headID # now process the non-leaf, non-H points. 
# copy information to self.graph for subgraph in self._subgraphs: for relation in subgraph.leaves: if relation[0] not in self.nodes: for _subgraph in self._subgraphs: if _subgraph.startnode.ID == relation[0]: relation[0] = _subgraph.headID #print("The previous ID" + str(relation[0]) + " is replaced by head ID" + str(_subgraph.headID)) break self._AddEdge(relation[0], relation[1], subgraph.headID) index = 0 prevnode = None for node in sorted(self.nodes.values(), key=operator.attrgetter("StartOffset")): node.Index = index if prevnode: self._AddEdge(node.ID, "RIGHT", prevnode.ID) self._AddEdge(prevnode.ID, "LEFT", node.ID) prevnode = node index += 1 self._MarkNext() self.root = self._roots[0] if logging.root.isEnabledFor(logging.DEBUG): logging.debug("End of transform:\n {}".format(self))
# def LoopTest2(n):
#     for _ in range(n):
#         old_Tokenize_cn('響著錄中文规则很长 very long , 为啥是不?')

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s [%(levelname)s] %(message)s')
    logging.info("Start")

    # import ProcessSentence
    # ProcessSentence.LoadCommon()  # too heavy to load for debugging

    FeatureOntology.LoadFeatureOntology('../../fsa/Y/feature.txt')
    Lexicon.LoadSegmentLexicon()
    XLocation = '../../fsa/X/'
    Lexicon.LoadExtraReference(XLocation + 'CuobieziX.txt', Lexicon._LexiconCuobieziDict)
    Lexicon.LoadExtraReference(XLocation + 'Fanti.txt', Lexicon._LexiconFantiDict)

    main_x = Tokenize('科普:。,?带你看懂蜀绣冰壶比赛')
    #old_Tokenize_cn('很少有科普:3 minutes 三分钟带你看懂蜀绣冰壶比赛')

    import cProfile, pstats
    cProfile.run("LoopTest1(100)", 'restatslex')
    pstat = pstats.Stats('restatslex')
    pstat.sort_stats('time').print_stats(10)
def engine(): ########################################### # EDIT HERE: Input company and dates to search ########################################### company = 'facebook' year = '2018' month = '05' today = 11 # gather lexicon/naive bayes trained data sentLex = Lexicon.sentLexicon() freqPOS, freqNEG = Naive_Bayes.frequency() countPOS, countNEG, countPOSNEG = Naive_Bayes.count(freqPOS, freqNEG) # reporting data data = "" artData = "" # graphing data days = [] lexPOSGraph = [] lexNEGGraph = [] nbPOSGraph = [] nbNEGGraph = [] # gather articles and analyze sentiment for entire month up to date for day in range(1, today + 1): day = str(day) articles = newsArticles(company, year, month, day) # article data for report artData += 'Company: {}, Month: {}, Day: {}, Year: {}\n\n'.format( company, month, day, year) for article in articles: artData += article + "\n\n" lexPOS, lexNEG, lexNEUT = runLexicon(articles, sentLex) nbPOS, nbNEG, nbNEUT = runNaiveBayes(articles, freqPOS, freqNEG, countPOS, countNEG, countPOSNEG) result = suggestion(lexPOS, lexNEG, nbPOS, nbNEG) # graphing data days.append(day) lexPOSGraph.append(lexPOS) lexNEGGraph.append(lexNEG) nbPOSGraph.append(nbPOS) nbNEGGraph.append(nbNEG) # display data print('Company: {}, Month: {}, Day: {}, Year: {}'.format( company, month, day, year)) print("Lexicon - POS: {}, NEG: {}, NEUT: {}".format( lexPOS, lexNEG, lexNEUT)) print("Naive Bayes - POS: {}, NEG: {}, NEUT: {}".format( nbPOS, nbNEG, nbNEUT)) print("Results: " + result) print("______________________________________________________") # sentiment data for report data += '\nMonth: {}, Day: {}, Year: {}\n'.format(month, day, year) data += "Results: " + result + "\n" data += "Lexicon - POS: {},\tNEG: {},\tNEUT: {}\n".format( lexPOS, lexNEG, lexNEUT) data += "Naive Bayes - POS: {},\tNEG: {},\tNEUT: {}\n\n".format( nbPOS, nbNEG, nbNEUT) data += "______________________________________________________\n" graph(days, month, year, company, lexPOSGraph, lexNEGGraph, nbPOSGraph, nbNEGGraph) print("Graphs generated...") report(company, str(today), month, year, data, artData) print("Report generated...")
    whiteList = whiteList.strip().split(',')

    # make sure each item in the list is a valid cms node name
    # or possibly a shortcut like T3
    for site in blackList:
        try:
            Lexicon.cmsname(site)
        except Exception, text:
            msg = "ERROR in GRID.se_black_list: %s\n" % blackList
            msg += "%s\n'%s' is not a valid Phedex Node Name" % (text, site)
            raise CrabException(msg)

    for site in whiteList:
        try:
            Lexicon.cmsname(site)
        except Exception, text:
            msg = "ERROR in GRID.se_white_list: %s\n" % whiteList
            msg += "%s\n'%s' is not a valid Phedex Node Name" % (text, site)
            raise CrabException(msg)


def parseIntoList(param):
    """
    to be used to make sure that one crab config parameter is usable
    as a list of strings, even if it is a string with commas inside
    in the config file
    """
    if type(param) == type("string"):
        list = param.split(',')
        for item in list:
            item = item.strip()
for emo in emotions:
    d = r"Thesis - Inputdata\2018-EI-oc-En-" + emo + "-dev-and-train.txt"
    training_data_paths.append(d)

test_data_paths = []
for emo in emotions:
    d = r"Thesis - Inputdata\2018-EI-oc-En-" + emo + "-test-gold.txt"
    test_data_paths.append(d)

##
# Insert path to the Lexicon files:
print("Loading Lexicons")
path_emo_lex = r"Lexicons\NRC-Hashtag-Emotion-Lexicon-v0.2\NRC-Hashtag-Emotion-Lexicon-v0.2.txt"
path_sen_lex = r"Lexicons\NRC-Hashtag-Sentiment-Lexicon-v1.0\HS-unigrams.txt"

# Generating the lexicons
lexi = l.load_lexicon(l.datareader(path_emo_lex))
lexi += l.load_lexicon(l.datareader(path_sen_lex, Elex=False), Elex=False)
print("Complete")

##
# Insert paths/type of embeddings (BERT, GloVe, Word2Vec: Skipgram)
path_bert = 'book_corpus_wiki_en_cased'
path_glove = r"Thesis - Embeddings\GloVe\glove.6B.300d.w2vformat.txt"
path_word2vec = r"Thesis - Embeddings\Word2Vec\GoogleNews-vectors-negative300.bin"

# loading the embedding methods
glove = em.WordToVec("glove", path_glove)
word_two_vec = em.WordToVec("word2vec", path_word2vec)
bert = em.Bert("bert", path_bert)
embeds = [glove, word_two_vec, bert]
def Reload(self, ReloadTask): utils.InitDB() PipeLineLocation = ParserConfig.get("main", "Pipelinefile") XLocation = os.path.dirname(PipeLineLocation) + "/" Reply = "Lexicon/Rule/Pipeline:" systemfileolderthanDB = ProcessSentence.SystemFileOlderThanDB( XLocation) if ReloadTask.lower() == "/lexicon": logging.info("Start loading lexicon...") Lexicon.ResetAllLexicons() # ProcessSentence.LoadCommonLexicon(XLocation) for action in ProcessSentence.PipeLine: if action.startswith("Lookup Spelling:"): Spellfile = action[action.index(":") + 1:].strip().split(",") for spell in Spellfile: spell = spell.strip() if spell: Lexicon.LoadExtraReference( XLocation + spell, Lexicon._LexiconCuobieziDict) if action.startswith("Lookup Encoding:"): Encodefile = action[action.index(":") + 1:].strip().split(",") for encode in Encodefile: encode = encode.strip() if encode: Lexicon.LoadExtraReference( XLocation + encode, Lexicon._LexiconFantiDict) if action.startswith("Lookup Main:"): Mainfile = action[action.index(":") + 1:].strip().split(",") for main in Mainfile: main = main.strip() if main: Lexicon.LoadMainLexicon(XLocation + main) if action.startswith("Lookup SegmentSlash:"): Slashfile = action[action.index(":") + 1:].strip().split(",") for slash in Slashfile: slash = slash.strip() if slash: Lexicon.LoadSegmentSlash(XLocation + slash) if action.startswith("Lookup Lex:"): Lexfile = action[action.index(":") + 1:].strip().split(",") for lex in Lexfile: lex = lex.strip() if lex: Lexicon.LoadLexicon(XLocation + lex) if action.startswith("Lookup defLex:"): Compoundfile = action[action.index(":") + 1:].strip().split(",") for compound in Compoundfile: compound = compound.strip() if compound: Lexicon.LoadLexicon( XLocation + compound, lookupSource=LexiconLookupSource.defLex) if action.startswith("Lookup External:"): Externalfile = action[action.index(":") + 1:].strip().split(",") for external in Externalfile: external = external.strip() if external: Lexicon.LoadLexicon( XLocation + external, lookupSource=LexiconLookupSource.External) if action.startswith("Lookup oQcQ:"): oQoCfile = action[action.index(":") + 1:].strip().split(",") for oQoC in oQoCfile: oQoC = oQoC.strip() if oQoC: Lexicon.LoadLexicon( XLocation + oQoC, lookupSource=LexiconLookupSource.oQcQ) Lexicon.LoadSegmentLexicon() Reply += "Reloaded lexicon at " + str(datetime.now()) if ReloadTask.lower() == "/rule": logging.info("Start loading rules...") #Rules.ResetAllRules() #ProcessSentence.WinningRuleDict.clear() GlobalmacroLocation = os.path.join(XLocation, "../Y/GlobalMacro.txt") Rules.LoadGlobalMacro(GlobalmacroLocation) for action in ProcessSentence.PipeLine: if action.startswith("FSA "): Rulefile = action[3:].strip() RuleLocation = os.path.join(XLocation, Rulefile) if RuleLocation.startswith("."): RuleLocation = os.path.join( os.path.dirname(os.path.realpath(__file__)), RuleLocation) if not systemfileolderthanDB or not Rules.RuleFileOlderThanDB( RuleLocation): Rules.LoadRules(XLocation, Rulefile, systemfileolderthanDB) elif action.startswith("DAGFSA_APP "): # FUZZY Rulefile = action[10:].strip() RuleLocation = os.path.join(XLocation, Rulefile) if RuleLocation.startswith("."): RuleLocation = os.path.join( os.path.dirname(os.path.realpath(__file__)), RuleLocation) if not systemfileolderthanDB or not Rules.RuleFileOlderThanDB( RuleLocation): Rules.LoadRules(XLocation, Rulefile, systemfileolderthanDB, fuzzy=True) # Rules.LoadRules(XLocation, Rulefile, systemfileolderthanDB, fuzzy=True) elif action.startswith("DAGFSA "): Rulefile = action[6:].strip() 
RuleLocation = os.path.join(XLocation, Rulefile) if RuleLocation.startswith("."): RuleLocation = os.path.join( os.path.dirname(os.path.realpath(__file__)), RuleLocation) if not systemfileolderthanDB or not Rules.RuleFileOlderThanDB( RuleLocation): Rules.LoadRules(XLocation, Rulefile, systemfileolderthanDB) Reply += "Reloaded rules at " + str(datetime.now()) if ReloadTask.lower() == "/pipeline": logging.info("Start loading pipeline...") Rules.ResetAllRules() ProcessSentence.PipeLine = [] ProcessSentence.LoadCommon() Reply += "Reloaded pipeline at " + str(datetime.now()) ProcessSentence.UpdateSystemFileFromDB(XLocation) self.send_response(200) self.send_header('Content-type', "text/html; charset=utf-8") self.end_headers() self.wfile.write(Reply.encode("utf-8")) utils.CloseDB(utils.DBCon)