def build_ngram_unigram_dictionaries(transformed_token_list,
                                     correct_token_list, incorrect_token_list,
                                     ngram_to_unigram_dictionary,
                                     unigram_to_ngram_dictionary):
    """Builds the ngram-to-unigram and unigram-to-ngram dictionaries from the
    frequent n-grams and the correct/incorrect token lists."""

    ngram_tuples = get_frequent_ngrams(transformed_token_list,
                                       p.ngram_occurence_freq)

    for ngram in ngram_tuples:
        merged_ngram = ""
        split_ngram_string = ""
        dash_separated_string = ""
        for t in list(ngram):
            merged_ngram = (merged_ngram + t).strip()
            split_ngram_string = (split_ngram_string + " " + t).strip()
            dash_separated_string = (dash_separated_string + "-" + t).strip()

        # check if the ngram forms a unigram when merged
        if merged_ngram in correct_token_list:
            ngram_to_unigram_dictionary.append([ngram, merged_ngram])
        elif merged_ngram in incorrect_token_list:
            unigram_to_ngram_dictionary.append(
                [merged_ngram, split_ngram_string, dash_separated_string])

    # print to dictionary files
    print_to_file(v.ngram_to_unigram_dictionary_path,
                  ngram_to_unigram_dictionary, v.ngram_to_unigram_headings)
    print_to_file(v.unigram_to_ngram_dictionary_path,
                  unigram_to_ngram_dictionary, v.unigram_to_ngram_headings)
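# --- Hedged usage sketch (not part of the original source): the two accumulator
# --- lists end up holding rows of the shape shown below before being written out.
# ngram_to_unigram = []   # rows like [('gear', 'box'), 'gearbox']        (merged form spells a known word)
# unigram_to_ngram = []   # rows like ['oilleak', 'oil leak', 'oil-leak'] (merged form is a misspelling)
# build_ngram_unigram_dictionaries(transformed_token_list, correct_token_list,
#                                  incorrect_token_list, ngram_to_unigram,
#                                  unigram_to_ngram)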
def semantic_transformation(data_path, sheet_name, columns, short_text_name):
    """Prints corpus with semantic transformation applied"""

    wo_data = pd.read_excel(data_path, sheet_name=sheet_name)
    selected_wo_data = pd.DataFrame(wo_data, columns=columns)
    short_text_list = selected_wo_data[short_text_name]  # just get short text

    # Step 1: Tokenization
    # Generates a token list with punctuation removed
    transformed_text_list = []

    for short_text in short_text_list:
        tokenized = tokenization(short_text)
        new_text = ''
        for token in tokenized:
            new_text += token.lower()
            new_text += ' '

        # Step 2: Semantic Transformation
        # Generates a token list transformed against regex matches
        transformed_text = semantic_transform(new_text)
        transformed_text_list.append(transformed_text)

    # Write output to file
    print_to_file(v.transformed_text_path_stage_1, transformed_text_list,
                  v.transformed_text_heading)
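# --- Hedged sketch (assumption): the `tokenization` helper used above is not shown
# --- in this example; based only on the "punctuation removed" comment, a minimal
# --- stand-in (hypothetical name) could look like this.
import re

def tokenization_sketch(text):
    """Split a short text into alphanumeric tokens, dropping punctuation."""
    return re.findall(r"[A-Za-z0-9]+", str(text))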
Code example #3
def lemmatisation():
    """Lemmatises corpus tokens, keeping a lemma only when it already appears
    in the record or (for plural forms) elsewhere in the corpus."""

    # open preprocessed tokens
    wo_data = pd.read_excel(v.input_file_path_lemmatisation,
                            sheet_name=v.input_file_sheet_name)
    selected_wo_data = pd.DataFrame(wo_data, columns=v.input_file_columns)
    transformed_token_list = list(selected_wo_data[v.input_file_column])

    # create list of tokens
    token_list = []
    for sentence in transformed_token_list:
        tokens = sentence.split(' ')
        for token in tokens:
            token_list.append(token)

    token_set = list(set(token_list))
    final_sentences = []
    for sentence in transformed_token_list:
        tokens = sentence.split(' ')
        final_tokens = []
        for w in tokens:
            final_word = w
            lemmatized_word = Lem.lemmatize(w)
            if len(w) > 3 and lemmatized_word != w:
                if lemmatized_word in tokens:
                    final_word = lemmatized_word
                elif len(w) > 4 and w[-1] == 's' and w[:-1] in token_set:
                    final_word = lemmatized_word
            final_tokens.append(final_word)
        final_sentences.append(' '.join(final_tokens))
    print_to_file(v.transformed_text_path_stage_3, final_sentences,
                  v.transformed_text_heading)
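# --- Hedged note (assumption): `Lem` is not created in this example; its
# --- Lem.lemmatize(w) call matches NLTK's WordNetLemmatizer, e.g.:
# from nltk.stem import WordNetLemmatizer
# Lem = WordNetLemmatizer()
# Lem.lemmatize('hoses')  # -> 'hose' (requires the NLTK 'wordnet' corpus)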
def abbreviation_correction():
    """Expands known abbreviations (e.g. 'lh' -> 'left-hand') and underscore
    trigrams (e.g. 'l _ h' -> 'left-hand') in the preprocessed corpus."""

    # open preprocessed tokens
    wo_data = pd.read_excel(v.input_file_path_abbreviation,
                            sheet_name=v.input_file_sheet_name)
    selected_wo_data = pd.DataFrame(wo_data, columns=v.input_file_columns)
    transformed_token_list = list(selected_wo_data[v.input_file_column])

    known_abbreviation_list = [['lh', 'left-hand'], ['rh', 'right-hand'],
                               ['flh', 'front-left-hand'],
                               ['rlh', 'rear-left-hand'],
                               ['frh', 'front-right-hand'],
                               ['rrh', 'rear-right-hand'],
                               ['rr', 'rear-right'], ['rl', 'rear-left'],
                               ['fr', 'front-right'], ['fl', 'front-left'],
                               ['rhs', 'right-hand-side'],
                               ['lhs', 'left-hand-side'], ['hr', 'hour'],
                               ['wk', 'week']]

    known_trigram_abbreviation_list = [
        ['u', '_', 's', 'unserviceable'],
        ['c', '_', 'o', 'changeout'],
        ['d', '_', 's', 'drivers'],
        ['a', '_', 'c', 'air conditioning'],
        ['l', '_', 'h', 'left-hand'],
        ['r', '_', 'h', 'right-hand'],
    ]

    final_sentences = []
    for sentence in transformed_token_list:
        tokens = sentence.split(' ')
        final_tokens = []

        flag = 0
        i = 0

        for index, token in enumerate(tokens):
            final_token = token

            # check if in abbreviation list
            for abbrev, full in known_abbreviation_list:
                if abbrev == token:
                    final_token = full

            # check if in trigram list
            for trigram in known_trigram_abbreviation_list:
                if (index != 0 and index != (len(tokens) - 1)
                        and tokens[index - 1] == trigram[0]
                        and tokens[index] == trigram[1]
                        and tokens[index + 1] == trigram[2]):
                    final_token = trigram[3]
                    flag = 1
                    i = index
            final_tokens.append(final_token)
        # if a trigram was expanded, drop the flanking single-letter tokens
        # (note: only the last matched trigram per sentence is cleaned up)
        if (flag == 1):
            final_tokens.pop(i + 1)
            final_tokens.pop(i - 1)
        final_sentences.append(' '.join(final_tokens))
    print_to_file(v.transformed_text_path_stage_4, final_sentences,
                  v.transformed_text_heading)
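# --- Worked example (illustrative, derived from the logic above): for the record
# --- "change l _ h mirror", the trigram ('l', '_', 'h') matches at the middle
# --- token, which becomes 'left-hand'; the flanking 'l' and 'h' are then popped,
# --- giving "change left-hand mirror". Plain abbreviations are simpler, e.g.
# --- "rr tyre worn" -> "rear-right tyre worn".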
def baseline_tagging(transformed_text_list):
    """Baseline tagging: verbs get '=', adjectives get '#', and noun-phrase
    chunks are joined with '~'."""

    tagged_records = []
    for sentence in transformed_text_list:

        s = sentence.strip().split(" ")

        # drop empty tokens left over from repeated spaces
        s = [token for token in s if token]

        # regular expression used for noun phrase chunking
        grammar = "NP: {<NN.*>*}"
        cp = nltk.RegexpParser(grammar)

        # tagged_s is a list of tuples consisting of the word and its pos tag
        if '+' not in s:
            tagged_s = nltk.pos_tag(s)
            for c, word_pos_tag_tuple in enumerate(tagged_s):
                word, pos_tag = word_pos_tag_tuple
                # only searching for the original verb
                if 'VB' in pos_tag:
                    s[c] = word + '='
                elif 'JJ' in pos_tag:
                    s[c] = word + '#'

            #noun phrase chunking for items detection
            result = cp.parse(tagged_s)
            for subtree in result.subtrees():
                if subtree.label() == 'NP':
                    t = subtree
                    noun_phrase_chunk = ' '.join(word for word, pos in t.leaves())
                    tagged_noun_phrase_chunk = '~'.join(word for word, pos in t.leaves())
                    starting_index_noun_phrase_chunk = position_of_ngram(tuple(noun_phrase_chunk.split()), s)
                    s[starting_index_noun_phrase_chunk] = tagged_noun_phrase_chunk
                    for i in range(1, len(t.leaves())):
                        s[starting_index_noun_phrase_chunk + i] = ''

            s = [x for x in s if x]
            string_to_print = ' '.join(s)
            tagged_records.append(string_to_print)
        else:
            string_to_print = ' '.join(s)
            tagged_records.append(string_to_print)

    print_to_file(v.baseline_output_path, tagged_records, v.output_headings)
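# --- Hedged demo (illustrative only): the "NP: {<NN.*>*}" grammar groups runs of
# --- consecutive nouns into one chunk, which the code above joins with '~'.
# import nltk
# tagged = nltk.pos_tag(['replace', 'fuel', 'pump', 'seal'])
# tree = nltk.RegexpParser("NP: {<NN.*>*}").parse(tagged)
# # if 'fuel', 'pump' and 'seal' are tagged NN*, they come back as a single NP
# # subtree and the record is rewritten with the chunk 'fuel~pump~seal'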
Code example #6
def spelling_correction():
    """Performs spelling correction on transformed token list"""

    # open preprocessed tokens
    wo_data = pd.read_excel(v.input_file_path_spelling_correction,
                            sheet_name=v.input_file_sheet_name)
    selected_wo_data = pd.DataFrame(wo_data, columns=v.input_file_columns)
    transformed_token_list = list(selected_wo_data[v.input_file_column])

    transformed_stage_1 = stage_1(transformed_token_list)
    transformed_stage_2 = stage_2(transformed_stage_1)
    transformed_stage_3 = stage_3(transformed_stage_2)

    print_to_file(v.transformed_text_path_stage_2, transformed_stage_3,
                  v.transformed_text_heading)
Code example #7
def stage_2(transformed_token_list):
    """Check tokens afaint unigram to ngram dictionary"""
    dict_data = pd.read_excel(v.stage_2_input_path,
                              sheet_name=v.input_file_sheet_name)
    selected_correct_token_data = pd.DataFrame(
        dict_data, columns=v.stage_2_input_file_columns)
    transformed_stage_2 = []
    for sentence in transformed_token_list:
        for row in selected_correct_token_data.itertuples():
            unigram = row.unigram.strip()
            if unigram in sentence.split(' '):
                sentence = sentence.replace(unigram, row.ngram)
        transformed_stage_2.append(sentence)
    print_to_file(v.stage_2_output_path, transformed_stage_2,
                  v.input_file_columns)
    return transformed_stage_2
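# --- Worked example (illustrative): if the unigram-to-ngram dictionary maps the
# --- run-together token 'oilleak' to the ngram 'oil leak', the record
# --- "oilleak under engine" is rewritten as "oil leak under engine".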
def build_correct_incorrect_token_dictionaries(transformed_token_list,
                                               correct_token_list,
                                               incorrect_token_list):

    for transformed_sentence in transformed_token_list:
        tokens = transformed_sentence.strip().split(' ')
        # drop empty tokens (splitting on repeated spaces yields '', not ' ')
        tokens = [t for t in tokens if t]
        correct_token_list.extend(spell.known(tokens))
        incorrect_token_list.extend(spell.unknown(tokens))

    # print correct and incorrect token lists to file
    print_to_file(v.correct_token_dictionary_path,
                  list(set(correct_token_list)), v.correct_token_heading)
    print_to_file(v.incorrect_token_dictionary_path,
                  list(set(incorrect_token_list)), v.incorrect_token_heading)
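# --- Hedged note (assumption): `spell` is not created in this example; its
# --- known()/unknown() calls match the pyspellchecker API, e.g.:
# from spellchecker import SpellChecker
# spell = SpellChecker()
# spell.known(['engine', 'enjin'])    # -> {'engine'}  (recognised words)
# spell.unknown(['engine', 'enjin'])  # -> {'enjin'}   (likely misspellings)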
def stage_1():
    '''Filter out all ngrams that contain a symptom/state, maintenance activity or stopword'''
    filtered_ngrams = []

    # Get frequent ngrams
    ngram_data = pd.read_excel(v.all_tagged_frequent_ngrams_path, sheet_name=v.input_file_sheet_name)


    frequent_ngram_data = pd.DataFrame(ngram_data, columns=v.ngrams_headings)

    # Filter ngrams
    stop_words = stopwords.words('english')
    stop_words = stop_words + ['right', 'left', 'front', 'rear', 'top', 'bottom', 'right-hand', 'left-hand', 'hand',
                               'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'hour', 'day', 'week', 'month', 'year']

    # get incorrect token list
    dict_data = pd.read_excel(v.incorrect_token_dictionary_path, sheet_name=v.input_file_sheet_name)
    incorrect_token_data = pd.DataFrame(dict_data, columns=v.incorrect_token_heading)
    incorrect_token_list = list(incorrect_token_data[v.incorrect_token_heading[0]])

    for index, row in frequent_ngram_data.iterrows():
        row['headword'] = get_proper_string(row['headword'])
        row['tailword1'] = get_proper_string(row['tailword1'])
        row['tailword2'] = get_proper_string(row['tailword2'])
        row['tailword3'] = get_proper_string(row['tailword3'])
        row['tailword4'] = get_proper_string(row['tailword4'])
        row['tailword5'] = get_proper_string(row['tailword5'])

        combined = (row['headword'] + ' ' + row['tailword1'] + ' ' +
                                             row['tailword2'] + ' '+row['tailword3']+
                                             ' ' +row['tailword4']+' '+row['tailword5']).strip()

        if (v.symptom_state_tag_symbol not in combined) and (v.maintenance_activity_tag_symbol not in combined):
            # keep only records not already tagged as a symptom/state or maintenance activity

            combined_tokens = combined.split()
            found_flag = 0
            # check if contains "common stopwords"
            for token in combined_tokens:
                if token in stop_words or token in incorrect_token_list:
                    found_flag = 1

            if (found_flag == 0):
                filtered_ngrams.append(combined_tokens)

    print_to_file(v.maint_item_filtering_stage_1_path, filtered_ngrams, ['headword','tailword1', 'tailword2', 'tailword3', 'tailword4', 'tailword5'])
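# --- Hedged sketch (assumption): `get_proper_string` is not shown here; given how
# --- it is applied to possibly-empty Excel cells, a minimal stand-in could be:
# def get_proper_string_sketch(value):
#     """Return the cell as a clean string, mapping NaN/None to ''."""
#     if value is None or (isinstance(value, float) and value != value):
#         return ''
#     return str(value).strip()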
def main():
    print("Starting tagging: maintenance_item")

    preprocessed_data = pd.read_excel(v.maintenance_activity_output_path, sheet_name=v.input_file_sheet_name)
    selected_data = pd.DataFrame(preprocessed_data, columns=v.output_headings)
    transformed_text_list = list(selected_data[v.output_heading])

    ngrams = get_frequent_ngrams(transformed_text_list, p.ngram_occurence_freq)
    print_to_file(v.all_tagged_frequent_ngrams_path, ngrams, v.ngrams_headings)

    stage_1()
    print("stage 1 complete")
    if sys.argv[1] == "1": generate_word_embeddings() # long running operation
    stage_2()
    print("stage 2 complete")
    tagging(transformed_text_list)

    print("tagging: maintenance item tagging is complete")
    print('THE PROCESSING PIPELINE HAS COMPLETED SUCCESSFULLY')
Code example #11
def keyword_based_ngram_filtering():
    filtered_ngrams = []
    headwords = ['cannot', 'not', 'is', 'are']
    ngram_data = pd.read_excel(v.all_frequent_ngrams_path,
                               sheet_name=v.input_file_sheet_name)
    df = pd.DataFrame(ngram_data, columns=v.ngrams_headings)
    for index, row in df.iterrows():
        if row['headword'] != '' and row['headword'] in headwords:

            filtered_ngrams.append([
                row['headword'], row['tailword1'], row['tailword2'],
                row['tailword3'], row['tailword4'], row['tailword5']
            ])
        if row['headword'] == 'to' and row['tailword1'] == 'be':
            filtered_ngrams.append([
                row['headword'], row['tailword1'], row['tailword2'],
                row['tailword3'], row['tailword4'], row['tailword5']
            ])
    print_to_file(v.symptom_state_filtered_ngrams_path, filtered_ngrams,
                  v.ngrams_headings)
Code example #12
def stage_1(transformed_token_list):
    """Checks tokens against ngram to unigram dictionary"""
    dict_data = pd.read_excel(v.stage_1_input_path,
                              sheet_name=v.input_file_sheet_name)
    selected_correct_token_data = pd.DataFrame(
        dict_data, columns=v.stage_1_input_file_columns)
    transformed_stage_1 = []
    for sentence in transformed_token_list:
        for row in selected_correct_token_data.itertuples():
            b = list(literal_eval(row.ngram))
            ngram = ' '.join(b)
            split_bigram = ngram.split(' ')
            split_sentence = sentence.strip().split(' ')
            # replace the ngram with its merged unigram only when every part of
            # the bigram appears as a whole token in the sentence
            if (ngram in sentence and split_bigram[0] in split_sentence
                    and split_bigram[1] in split_sentence):
                sentence = sentence.replace(ngram, row.unigram)
        transformed_stage_1.append(sentence)
    print_to_file(v.stage_1_output_path, transformed_stage_1,
                  v.input_file_columns)
    return transformed_stage_1
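# --- Worked example (illustrative): the 'ngram' column stores a tuple literal such
# --- as "('gear', 'box')"; if its merged unigram is 'gearbox', the record
# --- "gear box oil leak" becomes "gearbox oil leak", because both 'gear' and
# --- 'box' appear as whole tokens in the sentence.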
Code example #13
def detect_activities(transformed_text_list, dictionary_list):
    """Tags tokens whose infinitive form appears in the maintenance-activity
    dictionary, skipping tokens already tagged as a symptom/state."""

    tagged_records = []

    try:
        conjugate('hello', 'inf')  # dirty fix for a python 3.7 / pattern first-call error
    except Exception:
        pass

    for sentence in transformed_text_list:
        if type(sentence) != float:  # pandas reads empty cells as NaN (float); skip those
            tokens = sentence.split(' ')
            for idx, token in enumerate(tokens):
                if v.symptom_state_tag_symbol not in token:  # if it has not already been tagged as a symptom/state
                    conjugated_current_word = conjugate(token, 'inf')
                    if conjugated_current_word in dictionary_list:
                        tokens[idx] = token + v.maintenance_activity_tag_symbol
            tagged_records.append(' '.join(tokens))
        else:
            tagged_records.append('')

    print_to_file(v.maintenance_activity_output_path, tagged_records,
                  v.output_headings)
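# --- Hedged note (assumption): `conjugate` is the one from pattern.en
# --- (`from pattern.en import conjugate`); conjugate('replaced', 'inf') returns
# --- 'replace', so the activity dictionary only needs to hold infinitive forms.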
Code example #14
def tagging(transformed_text_list):
    """Tags symptom/state terms: n-gram matches via tag_record first, then
    single-term dictionary matches as a fallback."""
    tagged_records = []
    dictionary_data = pd.read_excel(v.symptom_state_dictionary_path,
                                    sheet_name=v.input_file_sheet_name)
    dictionary_df = pd.DataFrame(dictionary_data,
                                 columns=v.dictionary_headings)
    dictionary_list = []
    for index, row in dictionary_df.iterrows():
        dictionary_list.append(row['words'].split(' '))

    for sentence in transformed_text_list:
        tokens = sentence.strip().split(' ')
        total_ngrams = []
        for n in range(2, 7):
            total_ngrams = total_ngrams + list(ngrams(tokens, n))
        tagged, flag = tag_record(tokens, total_ngrams, dictionary_list)

        # else if single term matches then tag
        # case 1: single term from describing nouns
        # case 2: single term from dictionary
        if (flag == 0):
            for index, token in enumerate(tokens):
                for row in dictionary_list:
                    if len(row) == 1 and token == row[0] and len(token) > 3:
                        tokens[index] = token + v.symptom_state_tag_symbol

        if (flag == 1):
            # join words ending with the tag symbol to the following word without a
            # space, so a multi-word symptom/state collapses into one tagged token
            joined = ''.join(w if (w.endswith(v.symptom_state_tag_symbol) and (
                i - 1 != len(tagged)
                and tagged[i].endswith(v.symptom_state_tag_symbol))) else w +
                             ' ' for i, w in enumerate(tagged)).lstrip()
            tagged_records.append(joined)
        else:
            tagged_records.append(' '.join(tokens))

    print_to_file(v.symptom_state_output_path, tagged_records,
                  v.output_headings)
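# --- Hedged note (assumption): `ngrams` is nltk.util.ngrams, so the loop above
# --- collects every 2- to 6-gram of a record, e.g.:
# from nltk.util import ngrams
# list(ngrams(['engine', 'will', 'not', 'start'], 2))
# # -> [('engine', 'will'), ('will', 'not'), ('not', 'start')]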
Code example #15
def dictionary_building():
    symptom_state_dictionary = []
    # step 1: get filtered ngrams and append to dictionary
    n_data = pd.read_excel(v.symptom_state_filtered_ngrams_path,
                           sheet_name=v.input_file_sheet_name)
    df = pd.DataFrame(n_data, columns=v.ngrams_headings)
    for index, row in df.iterrows():
        row['headword'] = get_proper_string(row['headword'])
        row['tailword1'] = get_proper_string(row['tailword1'])
        row['tailword2'] = get_proper_string(row['tailword2'])
        row['tailword3'] = get_proper_string(row['tailword3'])
        row['tailword4'] = get_proper_string(row['tailword4'])
        row['tailword5'] = get_proper_string(row['tailword5'])

        symptom_state_dictionary.append(
            (row['headword'] + ' ' + row['tailword1'] + ' ' +
             row['tailword2'] + ' ' + row['tailword3'] + ' ' +
             row['tailword4'] + ' ' + row['tailword5']).strip())

        # step 2: append tailwords to dictionary if headword is "is" or "are"
        if row['headword'] == "is" or row['headword'] == "are":
            words = (row['tailword1'] + ' ' + row['tailword2'] + ' ' +
                     row['tailword3'] + ' ' + row['tailword4'] + ' ' +
                     row['tailword5']).strip()
            symptom_state_dictionary.append(words)

    # step 3: append additional describing nouns to dictionary
    symptom_state_dictionary.extend(
        ['problem', 'error', 'leak', 'fault', 'damage', 'failure'])

    print_to_file(v.symptom_state_dictionary_path, symptom_state_dictionary,
                  v.dictionary_headings)
Code example #16
def ngram_detection(text_list):
    ngrams = get_frequent_ngrams(text_list, p.ngram_occurence_freq)
    print_to_file(v.all_frequent_ngrams_path, ngrams, v.ngrams_headings)
def tagging(transformed_text_list):
    """Tags maintenance items in three passes: the filtered ngram dictionary, a
    static dictionary, then a merge of adjacent tagged tokens."""

    tagged_records = []

    # check against the filtered maintenance-item dictionary (stage 2 output)
    dictionary_data = pd.read_excel(v.maint_item_filtering_stage_2_path, sheet_name=v.input_file_sheet_name)
    dictionary_df = pd.DataFrame(dictionary_data, columns=v.dictionary_headings)

    dictionary_list = []
    for index, row in dictionary_df.iterrows():
        dictionary_list.append(row['words'].split(' '))

    counter = 0
    for sentence in transformed_text_list:
        counter = counter + 1
        # print(counter)

        if type(sentence) != float:

            tokens = sentence.strip().split(' ')
            total_ngrams = []
            # reverse order so that longer n-grams are tagged first
            for n in range(7, 1, -1):
                total_ngrams = total_ngrams + list(ngrams(tokens, n))
            tagged, flag = tag_record(tokens, total_ngrams, dictionary_list)

            # else if single term matches then tag
            if (flag == 0):
                for index, token in enumerate(tokens):
                    for row in dictionary_list:
                        if len(row) == 1 and token == row[0]:
                            tokens[index] = token + v.maintenance_item_tag_symbol

            if (flag == 1):
                joined = ''.join(
                    w if (w.endswith(v.maintenance_item_tag_symbol)) else w + ' ' for i, w in
                    enumerate(tagged)).lstrip()

                tagged_records.append(joined)

                # print(joined)

            else:
                tagged_records.append(' '.join(tokens))
        else:
            tagged_records.append('')

    print_to_file(v.maintenance_item_tagging_1, tagged_records, v.output_headings)

    tagged_records = []

    preprocessed_data = pd.read_excel(v.maintenance_item_tagging_1, sheet_name=v.input_file_sheet_name)
    selected_data = pd.DataFrame(preprocessed_data, columns=v.output_headings)
    transformed_text_list = list(selected_data[v.output_heading])

    # check against static dictionary
    print('step 2 begin')
    dictionary_data_static = pd.read_excel(v.maint_item_static_dictionary_path, sheet_name=v.input_file_sheet_name)
    dictionary_df_static = pd.DataFrame(dictionary_data_static, columns=['words'])

    dictionary_list = []
    for index, row in dictionary_df_static.iterrows():
        dictionary_list.append(row['words'].split(' '))

    counter = 0
    for sentence in transformed_text_list:
        counter = counter + 1
        # print(counter)

        if type(sentence) != float:

            tokens = sentence.strip().split(' ')
            total_ngrams = []
            # reverse order so that longer n-grams are tagged first
            for n in range(7, 1, -1):
                total_ngrams = total_ngrams + list(ngrams(tokens, n))
            tagged, flag = tag_record(tokens, total_ngrams, dictionary_list)

            # else if single term matches then tag
            if (flag == 0):
                for index, token in enumerate(tokens):
                    for row in dictionary_list:
                        if len(row) == 1 and token == row[0]:
                            tokens[index] = token + v.maintenance_item_tag_symbol
                            #print(token)
            if (flag == 1):
                joined = ''.join(
                    w if (w.endswith(v.maintenance_item_tag_symbol)) else w + ' ' for i, w in
                    enumerate(tagged)).lstrip()
                tagged_records.append(joined)
                #print(joined)
            else:
                tagged_records.append(' '.join(tokens))
        else:
            tagged_records.append('')

    #print(tagged_records)

    print_to_file(v.maintenance_item_tagging_2, tagged_records, v.output_headings)

    # combine tags
    tagged_records = []

    print('step 3 begin')
    counter = 0

    preprocessed_data = pd.read_excel(v.maintenance_item_tagging_2, sheet_name=v.input_file_sheet_name)
    selected_data = pd.DataFrame(preprocessed_data, columns=v.output_headings)
    transformed_text_list = list(selected_data[v.output_heading])


    for sentence in transformed_text_list:

        counter = counter+1
        #print(counter)

        if type(sentence) != float:

            tagged_sentence = ''
            tokens = sentence.strip().split(' ')

            #print(tokens)
            i = 0
            while i <= len(tokens) - 1:
                # merge backward: the current and previous tokens are both tagged
                if i != 0 and (v.maintenance_item_tag_symbol in tokens[i]) and (v.maintenance_item_tag_symbol in tokens[i - 1]):
                    if tokens[i].endswith('~'):
                        tokens[i - 1] = tokens[i - 1] + tokens[i]
                    else:
                        tokens[i - 1] = tokens[i - 1] + '~' + tokens[i]
                    del tokens[i]

                    if i == len(tokens) - 1:
                        i = i + 1
                # merge forward: the current token ends with the tag and the next token is tagged
                elif i != len(tokens) - 1 and tokens[i].endswith(v.maintenance_item_tag_symbol) and (v.maintenance_item_tag_symbol in tokens[i + 1]):
                    tokens[i] = tokens[i] + tokens[i + 1]
                    del tokens[i + 1]
                    i = i + 1
                # merge forward: both tokens are tagged but the current one does not end with the tag
                elif i != len(tokens) - 1 and (v.maintenance_item_tag_symbol in tokens[i + 1]) and (v.maintenance_item_tag_symbol in tokens[i]):
                    tokens[i] = tokens[i] + '~' + tokens[i + 1]
                    del tokens[i + 1]
                    i = i + 1
                else:
                    i = i + 1

                # remove the trailing tag separator if the merged token is an ngram
                if i != len(tokens) and tokens[i].count('~') > 1 and tokens[i].endswith('~'):
                    tokens[i] = tokens[i][:-1]

            joined = ' '.join(tokens).lstrip()
            #print(joined)


            tagged_records.append(joined)

        else:
            tagged_records.append('+')

    #print(tagged_records)

    print_to_file(v.maintenance_item_output_path, tagged_records, v.output_headings)
def stage_2():
    '''Filter outlier words with word2vec'''

    outlier_words_dict = defaultdict(int)
    outlier_words_pos_filtered_dict = defaultdict(int)
    single_word_freq_dict = defaultdict(int)

    # get maintenance item dict
    ngram_data = pd.read_excel(v.maint_item_filtering_stage_1_path, sheet_name=v.input_file_sheet_name)
    maintenance_items = pd.DataFrame(ngram_data, columns=v.ngrams_headings)

    # get preprocessed maintenance records
    preprocessed_data = pd.read_excel(v.transformed_text_path_stage_4, sheet_name=v.input_file_sheet_name)
    selected_data = pd.DataFrame(preprocessed_data, columns=v.input_file_columns)
    transformed_text_list = list(selected_data[v.input_file_column])

    # get generated model
    model = gensim.models.Word2Vec.load(v.word_2_vec_model_path)

    for index, row in maintenance_items.iterrows():

        row['headword'] = get_proper_string(row['headword'])
        row['tailword1'] = get_proper_string(row['tailword1'])
        row['tailword2'] = get_proper_string(row['tailword2'])
        row['tailword3'] = get_proper_string(row['tailword3'])
        row['tailword4'] = get_proper_string(row['tailword4'])
        row['tailword5'] = get_proper_string(row['tailword5'])

        combined = (row['headword'] + ' ' + row['tailword1'] + ' ' +
                                                 row['tailword2'] + ' '+row['tailword3']+
                                                 ' ' +row['tailword4']+' '+row['tailword5']).strip()

        parts_list = combined.split(' ')
        number_of_parts = len(parts_list)


        for a in parts_list:

            #print(parts_list)

            single_word_freq_dict[a] += 1

            if a not in model.wv.vocab:
                number_of_parts -= 1

        if number_of_parts == 0:
            outlier_word = ''
            pos = ''
            dist_mean = 0
        else:
            outlier_word , dist_mean = customized_doesnt_match(model.wv, parts_list)
            outlier_words_dict[outlier_word] += 1
            outlier_words_pos_filtered_dict[outlier_word] += 1

    outlier_words_pos_filtered_dic_ranked_by_ratio = {}
    outlier_words_ratio_2 = {}
    for item in outlier_words_dict:
        freq, ratio, ratio_unfiltered = 0, 0, 0
        if item in single_word_freq_dict:
            freq = single_word_freq_dict[item]
            ratio_unfiltered = outlier_words_dict[item] / freq
            ratio = outlier_words_pos_filtered_dict[item] / freq
        outlier_words_pos_filtered_dic_ranked_by_ratio[item] = ratio
        outlier_words_ratio_2[item] = ratio_unfiltered

    ratio_threshold = 0.8
    stopwords_stage_2 = [a for a in outlier_words_pos_filtered_dic_ranked_by_ratio if outlier_words_pos_filtered_dic_ranked_by_ratio[a] > ratio_threshold]
    print(stopwords_stage_2)

    final = []

    for index, row in maintenance_items.iterrows():

        row['headword'] = get_proper_string(row['headword'])
        row['tailword1'] = get_proper_string(row['tailword1'])
        row['tailword2'] = get_proper_string(row['tailword2'])
        row['tailword3'] = get_proper_string(row['tailword3'])
        row['tailword4'] = get_proper_string(row['tailword4'])
        row['tailword5'] = get_proper_string(row['tailword5'])

        combined = (row['headword'] + ' ' + row['tailword1'] + ' ' +
                        row['tailword2'] + ' ' + row['tailword3'] +
                        ' ' + row['tailword4'] + ' ' + row['tailword5']).strip()

        found = 0
        combined_list = [row['headword'], row['tailword1'], row['tailword2'], row['tailword3'], row['tailword4'], row['tailword5']]

        for stopword in stopwords_stage_2:
            if stopword in combined_list:
                # print(combined_list)
                found = 1

        if found == 0:
            final.append(combined)

    print_to_file(v.maint_item_filtering_stage_2_path, final, v.dictionary_headings)
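# --- Hedged sketch (assumption): `customized_doesnt_match` is not shown in this
# --- example. Given that it returns an (outlier_word, dist_mean) pair for a list
# --- of in-vocabulary words, a plausible implementation mirrors gensim's
# --- doesnt_match and scores each word against the centroid of the group.
import numpy as np

def customized_doesnt_match_sketch(wv, words):
    """Return the word least similar to the group centroid, plus the mean similarity."""
    words = [w for w in words if w in wv.vocab]  # gensim < 4.0 vocab API, as used above
    vectors = np.array([wv[w] / np.linalg.norm(wv[w]) for w in words])
    centroid = vectors.mean(axis=0)
    centroid = centroid / np.linalg.norm(centroid)
    sims = vectors.dot(centroid)  # cosine similarity of each word to the centroid
    return words[int(np.argmin(sims))], float(sims.mean())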