def build_words(questions, configuration):
    """Encode the tokens of every question as dictionary ids.

    A fresh word dictionary is created, its unknown token is initialised,
    and every token of every question is registered in it.  The dictionary
    is stored on ``configuration.word_dict`` and a fixed set of extra words
    is registered afterwards.

    Args:
        questions: iterable of objects exposing a ``tokens`` sequence.
        configuration: settings object; ``use_word_padding`` and
            ``max_sequence_length`` are read, ``word_dict`` is written.

    Returns:
        ``np.array`` applied to the list of per-question id rows.
    """
    # NOTE(review): ``dict`` is presumably a project module shadowing the
    # builtin here (the builtin has no ``Dictionary`` attribute) - confirm.
    vocabulary = dict.Dictionary()
    vocabulary.initialize_unknown_token()

    encoded = []
    for question in questions:
        # ``add`` registers the token and hands back the id stored in the row.
        ids = [vocabulary.add(token) for token in question.tokens]

        # Short rows are filled up with the id of the unknown token,
        # but only when padding is switched on.
        if configuration.use_word_padding:
            while len(ids) < configuration.max_sequence_length:
                ids.append(vocabulary.get_index(constants.unknown_token))

        encoded.append(ids)

    encoded = np.array(encoded)

    # Publish the dictionary, then register the additional fixed words.
    configuration.word_dict = vocabulary
    for extra_word in (
        "competing",
        "compete",
        "activities",
        "activity",
        "year’s",
        "r&d",
        "t&e",
        "get",
        "located",
        "segment",
        "minimum",
        "maximum",
    ):
        configuration.word_dict.add(extra_word)

    return encoded
def main():
    """Classify one source/target sentence pair from the command line.

    Parses the command-line options, optionally selects a CUDA device,
    checks that the dictionary and model paths exist, loads the dictionary
    and prints the result of ``test`` for the given sentence pair.

    Exits with status 1 (after printing a message) when either the
    dictionary path or the model path does not exist.
    """
    # ``sys.exit`` is used instead of the ``exit`` helper injected by the
    # ``site`` module, which is not guaranteed to exist in every interpreter.
    import sys

    parser = argparse.ArgumentParser(
        description='Classifying a sentence with LSTM.')
    parser.add_argument('-m', '--model', required=True,
                        help='the path to the model file.')
    parser.add_argument('-s', '--source-sentence', required=True,
                        help='the source sentence to classify')
    parser.add_argument('-t', '--target-sentence', required=True,
                        help='the target sentence to classify')
    parser.add_argument('-d', '--dictionary', required=True,
                        help='the dictionary/vocabulary.')
    # Let argparse convert and validate the GPU id instead of calling
    # ``int()`` by hand at every use.
    parser.add_argument('-g', '--gpuid', required=False, default=-1, type=int,
                        help='the ID of the GPU to use.')
    args = parser.parse_args()

    # -1 means "run on the CPU"; a GPU is only selected when one was
    # requested and CUDA is actually available.
    deviceid = -1
    if args.gpuid > -1 and torch.cuda.is_available():
        deviceid = args.gpuid
        print('Using GPU ' + str(deviceid))
        torch.cuda.set_device(deviceid)

    # Validate both paths up front so a missing model is reported before
    # any time is spent loading the dictionary.
    if not os.path.exists(args.dictionary):
        print("Path not found: ", args.dictionary)
        sys.exit(1)
    if not os.path.exists(args.model):
        print("Path not found: ", args.model)
        sys.exit(1)

    dictionary = Dict.Dictionary(os.path.realpath(args.dictionary))
    dictionary.load_dictionary()
    data_pr = DataPreprocessor(dictionary)

    print(
        test(args.source_sentence, args.target_sentence, data_pr, args.model,
             deviceid))
def build_types(questions, type, configuration):
    """Build one-hot rows for the question type or answer type of each question.

    A fresh label dictionary is filled with the relevant attribute of every
    question and stored on ``configuration.label_dict``.  Each question is
    then turned into a row holding ``1`` where the dictionary entry equals
    the question's attribute and ``0`` elsewhere.

    Args:
        questions: iterable of objects exposing ``question_type`` and
            ``answer_type``; it is traversed twice.
        type: ``"cnn_question"`` selects ``question_type``; any other value
            selects ``answer_type``.
        configuration: settings object; ``label_dict`` is written.

    Returns:
        ``np.array`` applied to the list of one-hot rows.
    """
    encoded = []

    classes = dict.Dictionary()
    # Debug output showing the state of the dictionary right after creation.
    print("label_dict")
    print(classes.idx2str)

    # Which attribute of a question carries the class depends on the mode.
    source_attr = "question_type" if type == "cnn_question" else "answer_type"

    # First pass: collect every class value in the dictionary.
    for question in questions:
        classes.add(getattr(question, source_attr))

    configuration.label_dict = classes

    # Second pass: one row per question, one column per dictionary entry.
    for question in questions:
        encoded.append([
            1 if known == getattr(question, source_attr) else 0
            for known in configuration.label_dict.idx2str
        ])

    return np.array(encoded)
def build_words(questions, configuration):
    """Encode the tokens of every question as dictionary ids.

    A fresh word dictionary is created and every token of every question is
    registered in it.  The dictionary is stored on
    ``configuration.word_dict``.

    Args:
        questions: iterable of objects exposing a ``tokens`` sequence.
        configuration: settings object; ``use_word_padding`` and
            ``max_sequence_length`` are read, ``word_dict`` is written.

    Returns:
        ``np.array`` applied to the list of per-question id rows.
    """
    vocabulary = dict.Dictionary()
    # Debug output showing the state of the dictionary right after creation.
    print("Dict")
    print(vocabulary.idx2str)

    encoded = []
    for question in questions:
        # ``add`` registers the token and hands back the id stored in the row.
        ids = [vocabulary.add(token) for token in question.tokens]

        # Short rows are filled up with the id of the unknown token,
        # but only when padding is switched on.
        if configuration.use_word_padding:
            while len(ids) < configuration.max_sequence_length:
                ids.append(vocabulary.get_index(constants.unknown_token))

        encoded.append(ids)

    encoded = np.array(encoded)

    configuration.word_dict = vocabulary
    return encoded
def build_labels(questions, configuration):
    """Encode the semantic labels of every question as dictionary ids.

    A fresh label dictionary is created; when label padding is enabled the
    unknown-label marker is registered first.  Every semantic label of every
    question is then registered, and the dictionary is stored on
    ``configuration.label_dict``.

    Args:
        questions: iterable of objects exposing ``semantic_labels``.
        configuration: settings object; ``use_label_padding`` is read,
            ``label_dict`` is written.

    Returns:
        ``np.array`` applied to the list of per-question label-id rows.
    """
    tag_vocabulary = dict.Dictionary()
    # Debug output showing the state of the dictionary right after creation.
    print("label_dict")
    print(tag_vocabulary.idx2str)

    # The unknown-label marker is registered before any real label.
    if configuration.use_label_padding:
        tag_vocabulary.add(constants.unknown_label)

    encoded = []
    for question in questions:
        # ``add`` registers the label and hands back the id stored in the row.
        encoded.append(
            [tag_vocabulary.add(tag) for tag in question.semantic_labels])

    encoded = np.array(encoded)

    configuration.label_dict = tag_vocabulary
    return encoded
def __init__(self, words_filename, min_word_len, board_size):
    """Create the word dictionary, remember the board size and build the board.

    Args:
        words_filename: passed through to ``dictionary.Dictionary``.
        min_word_len: passed through to ``dictionary.Dictionary``; the
            matching upper bound is the module constant ``MAX_WORD_LEN``.
        board_size: stored as ``self.size``.
    """
    word_source = dictionary.Dictionary(
        words_filename,
        min_word_len,
        MAX_WORD_LEN,
    )
    self.dictionary = word_source
    self.size = board_size
    # Called last, after both attributes above have been assigned.
    self.gen_board()
def rest_api_call(question):
    """Tag a single question with the sequence tagger and return the labels.

    Builds the fixed word and label dictionaries, loads and prepares the
    word embeddings, constructs the ``SRLModel`` graph, initialises its
    variables and runs ``get_labels`` on the question.

    Args:
        question: the sentence handed to ``get_labels`` as ``sentence``.

    Returns:
        Whatever ``get_labels`` returns for the question.
    """
    config = Variables()

    # Fixed vocabulary; the order is significant because dictionary ids are
    # assigned in insertion order - presumably it has to match the
    # vocabulary used at training time (TODO confirm).
    words = [
        '<UNK>', 'how', 'is', 'our', 'financial', 'performance', 'versus',
        'analyst', 'estimates', 'which', 'market', 'units', 'generate', 'the',
        'most', 'cloud', 'revenue', 'have', 'had', 'highest', 'growth',
        'rates', 'region', 'generates', 'countries', 'generated', 'service',
        'country', 'has', 'in', 'past', 'half', 'year', 'unit', 'least',
        'organization', 'last', 'quarter', 'location', 'for', 'locations',
        'greater', 'than', 'ten', 'million', 'show', 'all', 'that', 'profits',
        'bigger', '1', 'are', 'top', 'regarding', 'many', 'more', '10',
        'percent', 'software', '5', 'on-premise', 'was', 'what', 'two',
        'quarters', 'during', 'us', 'development', 'of', 'throughout',
        'three', 'margin', 'product', 'objects', 'products', 'rolling',
        'four', 'germany', 'rate', 'compared', 'to', 'oracle', 'tell', 'me',
        'key', 'driver', 'who', 'does', 'cost', 'structure', 'united',
        'states', 'look', 'like', 'travel', 'and', 'entertainment',
        'expenses', 'much', 'license', 'over', 'hana', 'per', 'quarterly',
        'services', 'offerings', 'total', 'profit', 'first', 'north',
        'america', 'this', 'years', 'trend', 'apj', 'evolved', 'biggest',
        'based', 'on', 'with', 'emea', 'share', 'decreasing', 'digits',
        'compare', 'gross', 'between', 'france', 'italy', 'categories',
        'marketing', 'costs', 'differences', 'austria', 'calculate', 'each',
        'stake', '2016', 'display', 'high', 'percentage', 'difference',
        'influencer', 'operating', 'dach', 'lowest', 'earned', 'across',
        'company', 'q1', 'worst', 'bottom', '3', 'relation', 'impact',
        'looks', 'traffic', 'same', 'indonesia', 'revenues', 'higher',
        'figures', 'one', 'billion', 'dollars', 'great', 'britain', 'china',
        'comparison', 'regions', 'americas', 'double', 'digit', 'do', 'not',
        'achieved', 'rising', 'increasing', 'poland', 'number', 'a', 'lower',
        'whole', 'sap', 'saudi', 'arabia', 'profitability', 'japan', 'spend',
        'croatia', 'research', 'denmark', 'latin', 'annually', 'hardware',
        'monthly', 'run', '4', 'spain', 'india', 'europe', 'strongest',
        'split', 'by', 'sector', 'profitable', 'business', 'margins',
        'switzerland', 'belonging', 'south', 'numbers', 'runs', 'were',
        'best', 'performing', 'their', 'gains', '2017', 'distribution',
        'africa', 'stagnating', 'month', '2', 'organizations', 'having',
        'positive', 'every', 's4', '2015', '2014', 'combined', 'q2', 'caused',
        'erp', 'usa', 'canada', 'q3', 'q4', 'five', 'deals', 'pipeline',
        'facebook', 'dollar', 'sales', 'declining', 'salesperson', 'closed',
        'salespersons', 'salesmen', 'forecast', 'employee', 'sale', 'michael',
        'scott', 'employees', 'fulfilled', 'quotas', 'quota', 'goal', 'did',
        'reach', 'brazil', 'current', 'quote', 'close', 'ratio', 'target',
        'opportunities', 'currently', 'bookings', 'average', 'deal',
        'customers', 'we', 'companies', 'consider', 'buy', 'boardroom',
        'want', 'thinks', 'about', 'buying', 'missed', 'next', 'volume',
        'bought', 'new', 'won', 'as', 'well', 'ariba', 'concur', 'amount',
        'salesman', 'sold', 'licences', 'often', 'months', 'digital', 'r3',
        'led', 'contracts', 'resulted', 'end', 'fourth', 'since', 'licenses',
        'within', 'selling', 'argentina', 'now', 'john', 'schneider', 'six',
        'contract', 'volkswagen', 'sealed', '80', 'values', 'resigned',
        'from', '6', 'customer', 'loyalty', 'nestle', 'volumes', '250',
        'person', 'russia', 'leads', 'turned', 'into', 'sweden', 'items',
        'opportunity', 'stage', 'booking', 'weakest', 'related', 'holds',
        'promises', 'promising', 'iot', 'mexico', 'asia', 'keeps', 'his',
        'promise', 'keep', 'analytics', 'analytic', 'conversion',
        'responsible', 'thyssen', 'krupp', 'account', 'executive', 'persons',
        'lost', 'booked', 'starting', 'its', 'working', 'employed', 'open',
        'or', 'representatives', 'norway', 'name', 'representative', 'todd',
        'packer', 'hr', 'successfactors', 'audi', 'australia', 'porsche',
        'peru', 'worldwide', 'seven', 'names', 'fifty', 'an', 'at&t',
        'verizon', 'samsung', 'headcount', 'people', 'israel', 'fte', 'palo',
        'alto', 'department', 'developed', 'at', 'fully', 'loaded', 'plan',
        '2019', 'satisfaction', 'happiest', 'below', '95', 'dwight',
        'schrute', 'managing', 'managed', 'wage', 'managers', 'salary',
        'position', 'level', 'newtown', 'square', 'early', 'talents',
        'walldorf', 'learning', 'completions', 'online', 'courses',
        'completed', 'hired', 'fluctuation', 'thousand', 'women',
        'management', 'positions', 'leadership', 'manager', 'leader', 'men',
        'managerial', 'responsibility', 'tasks', 'growing', 'strong',
        'fewest', 'external', 'workforce', 'consulting', 'non', 'billable',
        'fastest', 'job', 'role', 'diversity', 'gender', 'among', 'workers',
        'hirings', 'so', 'far', 'joined', 'salaries', 'incomes', 'students',
        'bangalore', 'seattle', 'paris', '100', 'less', 'beijing', 'finance',
        'administration', 'innovation', 'human', 'resources', 'leaders',
        'leading', 'frequent', 'hold', 'grouped', 'age', 'movement', 'bill',
        'mcdermott', 'greece', 'subordinates', 'finished', 'jimmy', 'kimmel',
        'andy', 'bernard', 'where', 'departments', 'available', 'developer',
        'vacant', 'advertisement', 'earn', 'money', 'lot', 'christian',
        'klein', 'luka', 'mueller', 'head', 'been', 'signed', 'unlimited',
        'conditions', 'mee', 'possibilities', 'training', 'sessions',
        'offered', 'nordics', 'singapore', 'none', '50000', 'older', '30',
        '50', 'l3', 'threatening', 'competitors', 'competitor', 'threats',
        'main', 'ibm', 'microsoft', 'also', 'facing', 'gaining', 'momentum',
        'against', 'netsuite', 'fiercest', 'workday', 'salesforce',
        'in-memory', 'database', 'investments', 'earnings', 'budget', 'list',
        'dangerous', 'carries', 'invested', 'boosting', 'better', 'stock',
        'price', 'evaluation', 'index', 'exchange', 'google', 'january',
        'december', 'may', 'right', 'doing', 'prices', 'introduction', '8',
        'value', 'capitalization', 'today', 'increased', 'saps', 'latest',
        'ibms', 'indices', 'apple', 'evaluate', 'mercedes', 'competing',
        'compete', 'activities', 'activity', 'year’s', 'r&d', 't&e', 'get',
        'located', 'segment', 'minimum', 'maximum',
    ]

    labels = [
        'o', 'mea', 'cmp', 'res', 'argm', 'b_where', 'i_where', 'oper',
        'grpby',
    ]

    # Build the word and label dictionaries from the fixed lists above.
    word_dict = dict.Dictionary()
    word_dict.initialize_unknown_token()
    label_dict = dict.Dictionary()

    for word in words:
        word_dict.add(word)
    config.word_dict = word_dict

    for label in labels:
        label_dict.add(label)
    config.label_dict = label_dict

    # Load the word embeddings, re-key them to the dictionary and keep only
    # the numeric vectors.
    emb = embeddings.load(path=constants.data_folder,
                          file=config.embedding_data,
                          configuration=config)
    num_embeddings = embeddings.transpose(embeddings=emb, configuration=config)
    vector_embeddings = embeddings.get_vectors(num_embeddings,
                                               configuration=config)

    # Build the tagger inside its own graph.
    tag_graph = tf.Graph()
    with tag_graph.as_default():
        tagger = model.SRLModel(config, vector_embeddings,
                                config.label_dict.size())
        tagger.build()

    # Using the session as a context manager makes it the default session
    # and closes it on exit, so its resources are released after every call
    # instead of accumulating for the lifetime of the process.
    with tf.Session(graph=tag_graph) as tag_sess:
        with tag_graph.as_default():
            # NOTE(review): the variables are freshly initialised here and
            # no checkpoint restore is visible in this function - confirm
            # that trained weights are loaded elsewhere (e.g. by the model
            # or by get_labels).
            tf.global_variables_initializer().run()
            tags = get_labels(sentence=question, session=tag_sess,
                              tagger=tagger, configuration=config)

    return tags
def main():
    """Read the command-line arguments and start the training run.

    The train/dev/test source, target and label files are looked up inside
    the data folder as ``<split>.<extension>``.  The test label file is
    optional: when it does not exist, ``None`` is passed on instead.  The
    shared dictionary is loaded from ``data.dict`` in the same folder.
    """
    parser = argparse.ArgumentParser(
        description='Train an LSTM sentence-pair classifier.')

    # (flags, keyword settings) for each switch, in registration order.
    switches = (
        (('-d', '--data-folder'),
         {'required': True,
          'help': 'the folder containing the train, test, dev sets.'}),
        (('-s', '--source-ext'),
         {'required': False, 'default': 'src',
          'help': 'the extension of the source files.'}),
        (('-t', '--target-ext'),
         {'required': False, 'default': 'mt',
          'help': 'the extension of the target files.'}),
        (('-l', '--labels-ext'),
         {'required': False, 'default': 'hter',
          'help': 'the extension of the labels files.'}),
        (('-b', '--batch-size'),
         {'required': False, 'default': 64,
          'help': 'the batch size.'}),
        (('-a', '--attention-type'),
         {'required': False, 'default': None,
          'help': "the attention type: 'dot', 'rte', 'None'."}),
        (('-m', '--model-folder'),
         {'required': False, 'default': 'models',
          'help': 'the directory to save the models'}),
        (('-g', '--gpuid'),
         {'required': False, 'default': -1,
          'help': 'the ID of the GPU to use.'}),
    )
    for flags, settings in switches:
        parser.add_argument(*flags, **settings)
    args = parser.parse_args()

    data_root = os.path.realpath(args.data_folder)

    def split_file(split, extension):
        # Path of "<split>.<extension>" inside the resolved data folder.
        return os.path.join(data_root, split + '.' + extension)

    source_train_filename = split_file('train', args.source_ext)
    target_train_filename = split_file('train', args.target_ext)
    source_dev_filename = split_file('dev', args.source_ext)
    target_dev_filename = split_file('dev', args.target_ext)
    source_test_filename = split_file('test', args.source_ext)
    target_test_filename = split_file('test', args.target_ext)

    labels_train = split_file('train', args.labels_ext)
    labels_dev = split_file('dev', args.labels_ext)
    labels_test = split_file('test', args.labels_ext)
    # The test set may come without labels.
    if not os.path.exists(labels_test):
        labels_test = None

    # A single dictionary is shared by all three data sets; no separate
    # labels dictionary is loaded.
    dictionary_data = Dict.Dictionary(os.path.join(data_root, 'data.dict'))
    dictionary_data.load_dictionary()

    train_data = DP.DataLD(source_train_filename, target_train_filename,
                           labels_train, dictionary_data)
    dev_data = DP.DataLD(source_dev_filename, target_dev_filename,
                         labels_dev, dictionary_data)
    test_data = DP.DataLD(source_test_filename, target_test_filename,
                          labels_test, dictionary_data)

    # Use the requested GPU only when CUDA is available and a non-negative
    # id was given; otherwise run on the CPU.
    if torch.cuda.is_available() and int(args.gpuid) > -1:
        device_name = "cuda:" + str(args.gpuid)
    else:
        device_name = "cpu"
    device = torch.device(device_name)

    train(train_data, dev_data, test_data, int(args.batch_size), device,
          args.attention_type, args.model_folder)
def pre_processing(self):
    """Turn the raw rows in ``self.questions`` into ``Question`` objects.

    Each row is read positionally: id, question text, table name, question
    type, answer type and semantic labels.  All text fields are lower-cased.
    Table names, question types and answer types are collected in
    dictionaries that are stored on ``self.config`` at the end, and every
    semantic label not seen before is appended to
    ``self.config.semantic_labels``.

    Returns:
        The list of created ``Question`` objects, in input order.
    """
    results = []

    question_types = dict.Dictionary()
    answer_types = dict.Dictionary()
    table_names = dict.Dictionary()

    for record in self.questions:
        question_id = record[0]
        question_text = record[1].lower()

        table_name = record[2].lower()
        table_names.add(table_name)

        question_type = record[3].lower()
        question_types.add(question_type)

        answer_type = record[4].lower()
        answer_types.add(answer_type)

        raw_labels = record[5].lower()

        # Tokenise the question together with its labels, then POS-tag
        # the question tokens.
        tokens, label_tokens = self.token_generator(question_text, raw_labels)
        pos_tags = self.pos_generator(tokens)

        # Unique labels are kept in a plain list; per the original author's
        # note a dictionary would add "<UNK>", "o", "<s>", "</s>" on its own.
        for label in label_tokens:
            if label not in self.config.semantic_labels:
                self.config.semantic_labels.append(label)

        # A mismatch is only reported; the question is still kept.
        if len(tokens) != len(label_tokens):
            print("Error: Question", str(question_id), "has different lengths!")

        results.append(
            Question(question_id, question_text, tokens, pos_tags, "ner",
                     table_name, question_type, answer_type, label_tokens,
                     None))

    # Hand all questions to the helper that determines the longest sequence.
    self.max_sequence_length(results)

    self.config.table_name_dict = table_names
    self.config.question_types_dict = question_types
    self.config.answer_types_dict = answer_types

    return results
def text_pre_processing(self):
    """Turn the raw rows in ``self.questions`` into ``Question`` objects.

    Each row is read positionally: id, question text, question type, answer
    type and semantic labels.  All text fields are lower-cased.  Question
    types and answer types are collected in dictionaries that are stored on
    ``self.config`` at the end, and every semantic label not seen before is
    appended to ``self.config.semantic_labels``.

    Returns:
        The list of created ``Question`` objects, in input order.
    """
    results = []

    question_types = dict.Dictionary()
    answer_types = dict.Dictionary()

    for record in self.questions:
        question_id = record[0]
        question_text = record[1].lower()

        question_type = record[2].lower()
        question_types.add(question_type)

        answer_type = record[3].lower()
        answer_types.add(answer_type)

        raw_labels = record[4].lower()

        # Tokenise the question together with its labels, then POS-tag
        # the question tokens.
        tokens, label_tokens = self.token_generator(question_text, raw_labels)
        pos_tags = self.pos_generator(tokens)

        # Remember every label that has not been seen before.
        for label in label_tokens:
            if label not in self.config.semantic_labels:
                self.config.semantic_labels.append(label)

        results.append(
            Question(question_id, question_text, tokens, pos_tags, "ner",
                     question_type, answer_type, label_tokens))

    # Hand all questions to the helper that determines the longest sequence.
    self.max_sequence_length(results)

    self.config.question_types_dict = question_types
    self.config.answer_types_dict = answer_types

    return results