Example #1
def build_words(questions, configuration):

    config = configuration

    # Build an empty dictionary for words
    word_dict = dict.Dictionary()
    word_dict.initialize_unknown_token()

    # Placeholder for returned dataset
    x = []

    for question in questions:

        # Temporary row
        row = []

        for token in question.tokens:
            # First add "new" word to dict
            token_id = word_dict.add(token)

            # Add token_id to the data
            row.append(token_id)

        # If padding is enabled, fill the sequence with unknown tokens
        if config.use_word_padding:
            while len(row) < config.max_sequence_length:
                row.append(word_dict.get_index(constants.unknown_token))

        # Add new row to the data
        x.append(row)

    # Transform x to numpy array
    x = np.array(x)

    # Set configuration value for word_dict
    config.word_dict = word_dict

    config.word_dict.add("competing")
    config.word_dict.add("compete")
    config.word_dict.add("activities")
    config.word_dict.add("activity")
    config.word_dict.add("year’s")
    config.word_dict.add("r&d")
    config.word_dict.add("t&e")
    config.word_dict.add("get")
    config.word_dict.add("located")
    config.word_dict.add("segment")
    config.word_dict.add("minimum")
    config.word_dict.add("maximum")

    return x
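For readers without the project's dict module, here is a minimal self-contained sketch of the same indexing-and-padding idea; the Dictionary class below is a stand-in for the project's dict.Dictionary, and UNKNOWN_TOKEN stands in for constants.unknown_token:

import numpy as np

UNKNOWN_TOKEN = "<UNK>"  # stand-in for constants.unknown_token

class Dictionary:
    """Minimal stand-in: maps each new token to a stable integer id."""

    def __init__(self):
        self.str2idx = {}
        self.idx2str = []

    def add(self, token):
        # Insert the token on first sight, then return its id
        if token not in self.str2idx:
            self.str2idx[token] = len(self.idx2str)
            self.idx2str.append(token)
        return self.str2idx[token]

    def get_index(self, token):
        return self.str2idx[token]

word_dict = Dictionary()
word_dict.add(UNKNOWN_TOKEN)  # mirrors initialize_unknown_token()

sentences = [["how", "is", "revenue"], ["show", "profit"]]
max_sequence_length = 4

rows = []
for tokens in sentences:
    row = [word_dict.add(token) for token in tokens]
    # Pad short sequences with the unknown-token id, as build_words does
    while len(row) < max_sequence_length:
        row.append(word_dict.get_index(UNKNOWN_TOKEN))
    rows.append(row)

x = np.array(rows)
print(x.shape)  # (2, 4)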
Example #2
def main():
    parser = argparse.ArgumentParser(
        description='Classifying a sentence with LSTM.')
    parser.add_argument('-m',
                        '--model',
                        required=True,
                        help='the path to the model file.')
    parser.add_argument('-s',
                        '--source-sentence',
                        required=True,
                        help='the source sentence to classify')
    parser.add_argument('-t',
                        '--target-sentence',
                        required=True,
                        help='the target sentence to classify')
    parser.add_argument('-d',
                        '--dictionary',
                        required=True,
                        help='the dictionary/vocabulary.')
    parser.add_argument('-g',
                        '--gpuid',
                        required=False,
                        default=-1,
                        help='the ID of the GPU to use.')

    args = parser.parse_args()

    deviceid = -1
    if int(args.gpuid) > -1 and torch.cuda.is_available():
        deviceid = int(args.gpuid)
        print('Using GPU ' + str(deviceid))
        torch.cuda.set_device(deviceid)

    if os.path.exists(args.dictionary):
        dictionary = Dict.Dictionary(os.path.realpath(args.dictionary))
        dictionary.load_dictionary()
        data_pr = DataPreprocessor(dictionary)
    else:
        print("Path not found: ", args.dictionary)
        exit(1)

    if not os.path.exists(args.model):
        print("Path not found: ", args.model)
        exit(1)

    print(
        test(args.source_sentence, args.target_sentence, data_pr, args.model,
             deviceid))
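One detail worth noting: argparse returns every supplied flag value as a string, which is why the code above casts args.gpuid with int(). A quick self-contained illustration, passing the argument list explicitly instead of reading sys.argv:

import argparse

parser = argparse.ArgumentParser(description='Classifying a sentence with LSTM.')
parser.add_argument('-g', '--gpuid', required=False, default=-1,
                    help='the ID of the GPU to use.')

args = parser.parse_args(['-g', '0'])
print(repr(args.gpuid))      # '0' -- a string, hence the int(args.gpuid) casts
print(int(args.gpuid) > -1)  # True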
Example #3
def build_types(questions, type, configuration):

    config = configuration

    y = []

    # Build an empty dictionary for labels
    label_dict = dict.Dictionary()

    print("label_dict")
    print(label_dict.idx2str)

    for question in questions:

        if type == "cnn_question":
            # Fill dictionary with values
            label_dict.add(question.question_type)
        else:
            # Fill dictionary with values
            label_dict.add(question.answer_type)

    config.label_dict = label_dict

    # Build label data
    for question in questions:

        row = []
        for label in config.label_dict.idx2str:
            if type == "cnn_question":
                if label == question.question_type:
                    row.append(1)
                else:
                    row.append(0)
            else:
                if label == question.answer_type:
                    row.append(1)
                else:
                    row.append(0)

        y.append(row)

    y = np.array(y)

    return y
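The inner loop builds one one-hot row per question; the same encoding can be written compactly with plain numpy (the label names here are hypothetical):

import numpy as np

labels = ["count", "sum", "lookup"]        # hypothetical label inventory
question_types = ["sum", "lookup", "sum"]  # one label per question

label_to_idx = {label: i for i, label in enumerate(labels)}
y = np.zeros((len(question_types), len(labels)), dtype=int)
for row, qtype in enumerate(question_types):
    y[row, label_to_idx[qtype]] = 1

print(y)
# [[0 1 0]
#  [0 0 1]
#  [0 1 0]]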
Example #4
def build_words(questions, configuration):

    config = configuration

    # Build an empty dictionary for words
    word_dict = dict.Dictionary()

    print("Dict")
    print(word_dict.idx2str)

    # Placeholder for returned dataset
    x = []

    for question in questions:

        # Temporary row
        row = []

        for token in question.tokens:
            # First add "new" word to dict
            token_id = word_dict.add(token)

            # Add token_id to the data
            row.append(token_id)

        # If padding is enabled, fill the sequence with unknown tokens
        if config.use_word_padding:
            while len(row) < config.max_sequence_length:
                row.append(word_dict.get_index(constants.unknown_token))

        # Add new row to the data
        x.append(row)

    # Transform x to numpy array
    x = np.array(x)

    # Set configuration value for word_dict
    config.word_dict = word_dict

    return x
Example #5
def build_labels(questions, configuration):

    config = configuration

    # Build an empty dictionary for labels
    label_dict = dict.Dictionary()

    print("label_dict")
    print(label_dict.idx2str)

    # Add unknown label marker if label padding is enabled
    if config.use_label_padding:
        label_dict.add(constants.unknown_label)

    # Placeholder for returned labels
    y = []

    for question in questions:

        # Temporary row
        row = []

        for label in question.semantic_labels:
            # First add "new" label to dict
            label_id = label_dict.add(label)

            # Add label_id to the data
            row.append(label_id)

        # Add new row to the data
        y.append(row)

    # Transform y to numpy array
    y = np.array(y)

    # Set configuration value for label_dict
    config.label_dict = label_dict

    return y
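Unlike build_words, this function never pads its rows, so questions with different numbers of labels produce rows of different lengths; np.array on such a list yields a ragged object array rather than a 2-D matrix, and NumPy 1.24+ refuses the implicit conversion unless dtype=object is passed explicitly. A minimal demonstration:

import numpy as np

rows = [[0, 1, 2], [0, 1]]        # unequal lengths, as build_labels can produce
y = np.array(rows, dtype=object)  # explicit dtype avoids the NumPy >= 1.24 error
print(y.shape)                    # (2,) -- one object per row, not a 2-D matrix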
Example #6
    def __init__(self, words_filename, min_word_len, board_size):
        self.dictionary = dictionary.Dictionary(words_filename, min_word_len,
                                                MAX_WORD_LEN)
        self.size = board_size
        self.gen_board()
Example #7
def rest_api_call(question):

    config = Variables()
    constant = constants

    words = [
        '<UNK>', 'how', 'is', 'our', 'financial', 'performance', 'versus',
        'analyst', 'estimates', 'which', 'market', 'units', 'generate', 'the',
        'most', 'cloud', 'revenue', 'have', 'had', 'highest', 'growth',
        'rates', 'region', 'generates', 'countries', 'generated', 'service',
        'country', 'has', 'in', 'past', 'half', 'year', 'unit', 'least',
        'organization', 'last', 'quarter', 'location', 'for', 'locations',
        'greater', 'than', 'ten', 'million', 'show', 'all', 'that', 'profits',
        'bigger', '1', 'are', 'top', 'regarding', 'many', 'more', '10',
        'percent', 'software', '5', 'on-premise', 'was', 'what', 'two',
        'quarters', 'during', 'us', 'development', 'of', 'throughout', 'three',
        'margin', 'product', 'objects', 'products', 'rolling', 'four',
        'germany', 'rate', 'compared', 'to', 'oracle', 'tell', 'me', 'key',
        'driver', 'who', 'does', 'cost', 'structure', 'united', 'states',
        'look', 'like', 'travel', 'and', 'entertainment', 'expenses', 'much',
        'license', 'over', 'hana', 'per', 'quarterly', 'services', 'offerings',
        'total', 'profit', 'first', 'north', 'america', 'this', 'years',
        'trend', 'apj', 'evolved', 'biggest', 'based', 'on', 'with', 'emea',
        'share', 'decreasing', 'digits', 'compare', 'gross', 'between',
        'france', 'italy', 'categories', 'marketing', 'costs', 'differences',
        'austria', 'calculate', 'each', 'stake', '2016', 'display', 'high',
        'percentage', 'difference', 'influencer', 'operating', 'dach',
        'lowest', 'earned', 'across', 'company', 'q1', 'worst', 'bottom', '3',
        'relation', 'impact', 'looks', 'traffic', 'same', 'indonesia',
        'revenues', 'higher', 'figures', 'one', 'billion', 'dollars', 'great',
        'britain', 'china', 'comparison', 'regions', 'americas', 'double',
        'digit', 'do', 'not', 'achieved', 'rising', 'increasing', 'poland',
        'number', 'a', 'lower', 'whole', 'sap', 'saudi', 'arabia',
        'profitability', 'japan', 'spend', 'croatia', 'research', 'denmark',
        'latin', 'annually', 'hardware', 'monthly', 'run', '4', 'spain',
        'india', 'europe', 'strongest', 'split', 'by', 'sector', 'profitable',
        'business', 'margins', 'switzerland', 'belonging', 'south', 'numbers',
        'runs', 'were', 'best', 'performing', 'their', 'gains', '2017',
        'distribution', 'africa', 'stagnating', 'month', '2', 'organizations',
        'having', 'positive', 'every', 's4', '2015', '2014', 'combined', 'q2',
        'caused', 'erp', 'usa', 'canada', 'q3', 'q4', 'five', 'deals',
        'pipeline', 'facebook', 'dollar', 'sales', 'declining', 'salesperson',
        'closed', 'salespersons', 'salesmen', 'forecast', 'employee', 'sale',
        'michael', 'scott', 'employees', 'fulfilled', 'quotas', 'quota',
        'goal', 'did', 'reach', 'brazil', 'current', 'quote', 'close', 'ratio',
        'target', 'opportunities', 'currently', 'bookings', 'average', 'deal',
        'customers', 'we', 'companies', 'consider', 'buy', 'boardroom', 'want',
        'thinks', 'about', 'buying', 'missed', 'next', 'volume', 'bought',
        'new', 'won', 'as', 'well', 'ariba', 'concur', 'amount', 'salesman',
        'sold', 'licences', 'often', 'months', 'digital', 'r3', 'led',
        'contracts', 'resulted', 'end', 'fourth', 'since', 'licenses',
        'within', 'selling', 'argentina', 'now', 'john', 'schneider', 'six',
        'contract', 'volkswagen', 'sealed', '80', 'values', 'resigned', 'from',
        '6', 'customer', 'loyalty', 'nestle', 'volumes', '250', 'person',
        'russia', 'leads', 'turned', 'into', 'sweden', 'items', 'opportunity',
        'stage', 'booking', 'weakest', 'related', 'holds', 'promises',
        'promising', 'iot', 'mexico', 'asia', 'keeps', 'his', 'promise',
        'keep', 'analytics', 'analytic', 'conversion', 'responsible',
        'thyssen', 'krupp', 'account', 'executive', 'persons', 'lost',
        'booked', 'starting', 'its', 'working', 'employed', 'open', 'or',
        'representatives', 'norway', 'name', 'representative', 'todd',
        'packer', 'hr', 'successfactors', 'audi', 'australia', 'porsche',
        'peru', 'worldwide', 'seven', 'names', 'fifty', 'an', 'at&t',
        'verizon', 'samsung', 'headcount', 'people', 'israel', 'fte', 'palo',
        'alto', 'department', 'developed', 'at', 'fully', 'loaded', 'plan',
        '2019', 'satisfaction', 'happiest', 'below', '95', 'dwight', 'schrute',
        'managing', 'managed', 'wage', 'managers', 'salary', 'position',
        'level', 'newtown', 'square', 'early', 'talents', 'walldorf',
        'learning', 'completions', 'online', 'courses', 'completed', 'hired',
        'fluctuation', 'thousand', 'women', 'management', 'positions',
        'leadership', 'manager', 'leader', 'men', 'managerial',
        'responsibility', 'tasks', 'growing', 'strong', 'fewest', 'external',
        'workforce', 'consulting', 'non', 'billable', 'fastest', 'job', 'role',
        'diversity', 'gender', 'among', 'workers', 'hirings', 'so', 'far',
        'joined', 'salaries', 'incomes', 'students', 'bangalore', 'seattle',
        'paris', '100', 'less', 'beijing', 'finance', 'administration',
        'innovation', 'human', 'resources', 'leaders', 'leading', 'frequent',
        'hold', 'grouped', 'age', 'movement', 'bill', 'mcdermott', 'greece',
        'subordinates', 'finished', 'jimmy', 'kimmel', 'andy', 'bernard',
        'where', 'departments', 'available', 'developer', 'vacant',
        'advertisement', 'earn', 'money', 'lot', 'christian', 'klein', 'luka',
        'mueller', 'head', 'been', 'signed', 'unlimited', 'conditions', 'mee',
        'possibilities', 'training', 'sessions', 'offered', 'nordics',
        'singapore', 'none', '50000', 'older', '30', '50', 'l3', 'threatening',
        'competitors', 'competitor', 'threats', 'main', 'ibm', 'microsoft',
        'also', 'facing', 'gaining', 'momentum', 'against', 'netsuite',
        'fiercest', 'workday', 'salesforce', 'in-memory', 'database',
        'investments', 'earnings', 'budget', 'list', 'dangerous', 'carries',
        'invested', 'boosting', 'better', 'stock', 'price', 'evaluation',
        'index', 'exchange', 'google', 'january', 'december', 'may', 'right',
        'doing', 'prices', 'introduction', '8', 'value', 'capitalization',
        'today', 'increased', 'saps', 'latest', 'ibms', 'indices', 'apple',
        'evaluate', 'mercedes', 'competing', 'compete', 'activities',
        'activity', 'year’s', 'r&d', 't&e', 'get', 'located', 'segment',
        'minimum', 'maximum'
    ]
    labels = [
        'o', 'mea', 'cmp', 'res', 'argm', 'b_where', 'i_where', 'oper', 'grpby'
    ]

    # Build an empty dictionary for words and labels
    word_dict = dict.Dictionary()
    word_dict.initialize_unknown_token()

    label_dict = dict.Dictionary()

    for word in words:
        word_dict.add(word)

    config.word_dict = word_dict

    for label in labels:
        label_dict.add(label)

    config.label_dict = label_dict

    # Load word embeddings that map words in some language to high-dimensional vectors
    emb = embeddings.load(path=constant.data_folder,
                          file=config.embedding_data,
                          configuration=config)

    # Transpose embedding keys to corresponding keys in dictionary
    num_embeddings = embeddings.transpose(embeddings=emb, configuration=config)

    # Get only numeric vectors of embeddings
    vector_embeddings = embeddings.get_vectors(num_embeddings,
                                               configuration=config)

    # Build Tensorflow Graph
    tag_graph = tf.Graph()
    with tag_graph.as_default():
        # Build Seq2Tag Model
        # Create model and load parameters.
        tagger = model.SRLModel(config, vector_embeddings,
                                config.label_dict.size())
        tagger.build()

    # Start Tensorflow Session
    tag_sess = tf.Session(graph=tag_graph)
    with tag_sess.as_default():
        with tag_graph.as_default():
            tf.global_variables_initializer().run()

            tags = get_labels(sentence=question,
                              session=tag_sess,
                              tagger=tagger,
                              configuration=config)

    return tags
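The graph-plus-session structure at the end is the standard TensorFlow 1.x idiom; a minimal self-contained sketch of just that pattern, with a trivial graph in place of the SRL model:

import tensorflow as tf  # TensorFlow 1.x API

tag_graph = tf.Graph()
with tag_graph.as_default():
    # Ops defined here are attached to tag_graph, not the global default graph
    a = tf.constant(2.0)
    b = tf.constant(3.0)
    total = a + b

tag_sess = tf.Session(graph=tag_graph)
with tag_sess.as_default():
    with tag_graph.as_default():
        tf.global_variables_initializer().run()  # a no-op here, kept to mirror the pattern
        print(total.eval())  # 5.0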
Example #8
def main():
    ''' Read arguments from the command line and initiate the training.
    '''

    parser = argparse.ArgumentParser(
        description='Train an LSTM sentence-pair classifier.')
    parser.add_argument(
        '-d',
        '--data-folder',
        required=True,
        help='the folder containing the train, test, dev sets.')
    parser.add_argument('-s',
                        '--source-ext',
                        required=False,
                        default='src',
                        help='the extension of the source files.')
    parser.add_argument('-t',
                        '--target-ext',
                        required=False,
                        default='mt',
                        help='the extension of the target files.')
    parser.add_argument('-l',
                        '--labels-ext',
                        required=False,
                        default='hter',
                        help='the extension of the labels files.')
    parser.add_argument('-b',
                        '--batch-size',
                        required=False,
                        default=64,
                        help='the batch size.')
    parser.add_argument('-a',
                        '--attention-type',
                        required=False,
                        default=None,
                        help='the attention type: \'dot\', \'rte\', \'None\'.')
    parser.add_argument('-m',
                        '--model-folder',
                        required=False,
                        default='models',
                        help='the directory to save the models')
    parser.add_argument('-g',
                        '--gpuid',
                        required=False,
                        default=-1,
                        help='the ID of the GPU to use.')

    args = parser.parse_args()

    source_train_filename = os.path.join(os.path.realpath(args.data_folder),
                                         'train.' + args.source_ext)
    target_train_filename = os.path.join(os.path.realpath(args.data_folder),
                                         'train.' + args.target_ext)
    source_dev_filename = os.path.join(os.path.realpath(args.data_folder),
                                       'dev.' + args.source_ext)
    target_dev_filename = os.path.join(os.path.realpath(args.data_folder),
                                       'dev.' + args.target_ext)
    source_test_filename = os.path.join(os.path.realpath(args.data_folder),
                                        'test.' + args.source_ext)
    target_test_filename = os.path.join(os.path.realpath(args.data_folder),
                                        'test.' + args.target_ext)

    labels_train = os.path.join(os.path.realpath(args.data_folder),
                                'train.' + args.labels_ext)
    labels_dev = os.path.join(os.path.realpath(args.data_folder),
                              'dev.' + args.labels_ext)

    labels_test = os.path.join(os.path.realpath(args.data_folder),
                               'test.' + args.labels_ext)
    if not os.path.exists(labels_test):
        labels_test = None

    data_dict_file = os.path.join(os.path.realpath(args.data_folder),
                                  'data.dict')
    #labels_dict_file = os.path.join(os.path.realpath(args.data_folder), 'ter.dict')
    dictionary_data = Dict.Dictionary(data_dict_file)
    dictionary_data.load_dictionary()
    #dictionary_labels = Dict.Labels(labels_dict_file)
    #dictionary_labels.load_dictionary()

    train_data = DP.DataLD(source_train_filename, target_train_filename,
                           labels_train,
                           dictionary_data)  #, dictionary_labels)
    dev_data = DP.DataLD(source_dev_filename, target_dev_filename, labels_dev,
                         dictionary_data)  #, dictionary_labels)
    test_data = DP.DataLD(source_test_filename, target_test_filename,
                          labels_test, dictionary_data)  #, dictionary_labels)

    device = torch.device("cuda:" +
                          str(args.gpuid) if torch.cuda.is_available()
                          and int(args.gpuid) > -1 else "cpu")

    train(train_data, dev_data, test_data, int(args.batch_size), device,
          args.attention_type, args.model_folder)
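The torch.device expression near the end is a common selection idiom; in isolation it looks like this (the gpuid value is hypothetical):

import torch

gpuid = 0  # hypothetical; -1 means "force CPU"
device = torch.device("cuda:" + str(gpuid)
                      if torch.cuda.is_available() and gpuid > -1 else "cpu")
x = torch.zeros(2, 2, device=device)
print(x.device)  # cuda:0 on a GPU machine, cpu otherwise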
Example #9
    def pre_processing(self):

        processed_questions = []

        # Build an empty dictionary for question types
        question_type_dict = dict.Dictionary()

        # Build an empty dictionary for answer types
        answer_type_dict = dict.Dictionary()

        # Build an empty dictionary for table names
        table_dict = dict.Dictionary()

        for item in self.questions:

            # Extract the id of the question in the dataset
            question_id = item[0]

            # Extract the questions from loaded inputs and convert to lower case
            question = item[1]
            question = question.lower()

            # Extract the table name lower case
            # and add table name to dictionary
            table_name = item[2]
            table_name = table_name.lower()
            table_dict.add(table_name)

            # Extract the question type from loaded inputs and convert to lower case
            # and add question type to dictionary
            question_type = item[3]
            question_type = question_type.lower()
            question_type_dict.add(question_type)

            # Extract the answer type from loaded inputs and convert to lower case
            # and add answer type to dictionary
            answer_type = item[4]
            answer_type = answer_type.lower()
            answer_type_dict.add(answer_type)

            # Extract the target labels from loaded inputs and convert to lower case
            semantic_labels = item[5]
            semantic_labels = semantic_labels.lower()

            # Tokenize Questions and Labels
            question_tokens, semantic_label_tokens = self.token_generator(
                question, semantic_labels)

            # Receive POS Tag List for tokens
            pos = self.pos_generator(question_tokens)

            # Get list of unique labels
            # We do not use a dictionary because "<UNK>", "o", "<s>", "</s>" would automatically be added when initializing the dict
            for token in semantic_label_tokens:
                if token not in self.config.semantic_labels:
                    self.config.semantic_labels.append(token)

            if len(question_tokens) != len(semantic_label_tokens):
                print("Error: Question", str(question_id),
                      "has different lengths!")

            # Create Question Object
            q = Question(question_id, question, question_tokens, pos, "ner",
                         table_name, question_type, answer_type,
                         semantic_label_tokens, None)
            processed_questions.append(q)

            # print(q.id, q.tokens, q.semantic_labels)

        # Call the method to find the longest sequence
        self.max_sequence_length(processed_questions)

        self.config.table_name_dict = table_dict
        self.config.question_types_dict = question_type_dict
        self.config.answer_types_dict = answer_type_dict

        return processed_questions
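The per-field extraction above reduces to unpacking fixed positions from each record and feeding each field into its own vocabulary; a compact sketch with a hypothetical record layout and sets standing in for dict.Dictionary:

records = [
    (1, "How is revenue?", "sales", "lookup", "number", "o o mea"),
    (2, "Show profit", "finance", "lookup", "number", "o mea"),
]

table_names, question_types, answer_types = set(), set(), set()
for _, question, table_name, question_type, answer_type, labels in records:
    table_names.add(table_name.lower())
    question_types.add(question_type.lower())
    answer_types.add(answer_type.lower())

print(sorted(table_names))     # ['finance', 'sales']
print(sorted(question_types))  # ['lookup']
print(sorted(answer_types))    # ['number']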
Example #10
    def text_pre_processing(self):

        processed_questions = []

        # Build an empty dictionary for question types
        question_type_dict = dict.Dictionary()

        # Build an empty dictionary for answer types
        answer_type_dict = dict.Dictionary()

        for item in self.questions:

            # Extract the id of the question in the dataset
            question_id = item[0]

            # Extract the questions from loaded inputs and convert to lower case
            question = item[1]
            question = question.lower()

            # Extract the question type from loaded inputs and convert to lower case
            # and add question type to dictionary
            question_type = item[2]
            question_type = question_type.lower()
            question_type_dict.add(question_type)

            # Extract the answer type from loaded inputs and convert to lower case
            # and add answer type to dictionary
            answer_type = item[3]
            answer_type = answer_type.lower()
            answer_type_dict.add(answer_type)

            # Extract the target labels from loaded inputs and convert to lower case
            semantic_labels = item[4]
            semantic_labels = semantic_labels.lower()

            # Tokenize Questions and Labels
            question_tokens, semantic_label_tokens = self.token_generator(
                question, semantic_labels)

            # Receive POS Tag List for tokens
            pos = self.pos_generator(question_tokens)

            # Get list of unique labels
            for token in semantic_label_tokens:
                if token not in self.config.semantic_labels:
                    self.config.semantic_labels.append(token)

            # Create Question Object
            q = Question(question_id, question, question_tokens, pos, "ner",
                         question_type, answer_type, semantic_label_tokens)
            processed_questions.append(q)

            # print(q.id)
            # print(q.question)
            # print(q.question_type)
            # print(q.answer_type)
            # print()

        # Call the method to find the longest sequence
        self.max_sequence_length(processed_questions)

        self.config.question_types_dict = question_type_dict
        self.config.answer_types_dict = answer_type_dict

        return processed_questions