def pre_process_text(self):
    # Create the pre-processed training and test sets
    self.trainset_pre_processed = [
        process(sentence).lemmatize
        for sentence in self.trainset["Description"].values
    ]
    self.testset_pre_processed = [
        process(sentence).lemmatize
        for sentence in self.testset["Description"].values
    ]
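The `process` helper this snippet relies on is not shown on this page. Below is a minimal stand-in, assuming `process(sentence)` returns an object whose `lemmatize` attribute yields the lemmatized tokens; the class name and the crude suffix rule are placeholders, not the project's real preprocessing.

class ProcessedSentence:
    def __init__(self, text):
        self.tokens = text.lower().split()

    @property
    def lemmatize(self):
        # crude suffix stripping as a stand-in for real lemmatization
        return [t[:-1] if t.endswith("s") else t for t in self.tokens]


def process(sentence):
    return ProcessedSentence(sentence)


print(process("The cats sit on mats").lemmatize)
# ['the', 'cat', 'sit', 'on', 'mat']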
Example #2
def run(self):
    raw_documents = self.reader.read()
    title_docs, abstract_docs = self.data_manager.parse_documents(
        raw_documents)
    title_doc_objs = pre_process.process(title_docs, self.pre_config,
                                         constants.SENTENCE_TYPE_TITLE)
    abs_doc_objs = pre_process.process(abstract_docs, self.pre_config,
                                       constants.SENTENCE_TYPE_ABSTRACT)
    doc_objects = self.data_manager.merge_documents(
        title_doc_objs, abs_doc_objs)
    dict_nern = ner.process(doc_objects, self.nern_config)
    self.writer.write(self.output_file, raw_documents, dict_nern)
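run() chains reader -> parse -> pre_process -> merge -> ner -> writer. The following self-contained sketch reproduces that data flow with stub components; every name in it is a placeholder standing in for the project's reader, data_manager, pre_process, ner, and writer modules.

class StubReader:
    def read(self):
        return [{"title": "Aspirin study", "abstract": "Aspirin reduces fever."}]


class StubDataManager:
    def parse_documents(self, raw_docs):
        return ([d["title"] for d in raw_docs],
                [d["abstract"] for d in raw_docs])

    def merge_documents(self, title_docs, abstract_docs):
        return list(zip(title_docs, abstract_docs))


def stub_process(texts, config, sentence_type):
    # tokenize each text; config and sentence_type are ignored in this stub
    return [text.split() for text in texts]


def stub_ner(documents, config):
    # pretend every capitalised token is an entity
    return {i: [tok for part in doc for tok in part if tok[0].isupper()]
            for i, doc in enumerate(documents)}


reader, data_manager = StubReader(), StubDataManager()
raw_documents = reader.read()
title_docs, abstract_docs = data_manager.parse_documents(raw_documents)
title_doc_objs = stub_process(title_docs, {}, "title")
abs_doc_objs = stub_process(abstract_docs, {}, "abstract")
doc_objects = data_manager.merge_documents(title_doc_objs, abs_doc_objs)
print(stub_ner(doc_objects, {}))  # {0: ['Aspirin', 'Aspirin']}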
def get_sents():
    sents = get_sent_dict()
    all_sents = []
    print("Creating Document objects...\n")
    documents = pre_process.process(sents, pre_config, constants.SENTENCE_TYPE_ABSTRACT)
    for doc in documents:
        for sentence in doc.sentences:
            token_list = [t.content for t in sentence.tokens]
            all_sents.append(token_list)
    print("Get all sentences complete.\n")
    return all_sents
def get_sents():
    all_sents = []

    # create list of tokens in sentences
    for dataset in datasets:
        print('Process dataset: ' + dataset)
        reader = BioCreativeReader(
            os.path.join(input_path, "cdr_" + dataset + ".txt"))
        raw_documents = reader.read()

        title_docs, abstract_docs = data_manager.parse_documents(raw_documents)

        title_doc_objs = pre_process.process(title_docs, pre_config,
                                             constants.SENTENCE_TYPE_TITLE)
        abs_doc_objs = pre_process.process(abstract_docs, pre_config,
                                           constants.SENTENCE_TYPE_ABSTRACT)
        documents = data_manager.merge_documents(title_doc_objs, abs_doc_objs)

        for doc in documents:
            for sentence in doc.sentences:
                token_list = [t.content for t in sentence.tokens]
                all_sents.append(token_list)
    return all_sents
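A list of token lists like the one get_sents() returns is exactly the input shape gensim's Word2Vec expects. A possible follow-on step is shown below; training embeddings with gensim is an assumption here, not something this page shows.

from gensim.models import Word2Vec  # gensim >= 4.0

sentences = get_sents()  # list of token lists built above
model = Word2Vec(sentences=sentences, vector_size=100, window=5,
                 min_count=1, workers=4)
model.save("cdr_word2vec.model")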
Example #5
def runHazus():
    entries = []
    entries.extend(root.fields.values())
    haz = pre_process.process(
        root.filename,
        entries)  # Run the Hazus script with input from user using the GUI

    print('Pre-Process RUN', haz, entries)
    if haz[0]:
        popupmsg(
            str(haz[1][0]) + ' records successfully processed of '
            + str(haz[1][1]) + ' records total.\n'
            + str(haz[2][1]) + ' Building DDFs assigned.\n'
            + str(haz[2][2]) + ' Content DDFs assigned.\n'
            + str(haz[2][3]) + ' Inventory DDFs assigned.\n'
            + str(haz[4][1]) + ' Building DDFs checked and ' + str(haz[3][1]) + ' found valid.\n'
            + str(haz[4][2]) + ' Content DDFs checked and ' + str(haz[3][2]) + ' found valid.\n'
            + str(haz[4][3]) + ' Inventory DDFs checked and ' + str(haz[3][3]) + ' found valid.\n'
            + 'File saved to: ' + root.filename)
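popupmsg() and root come from the surrounding tkinter GUI code, which is not shown here. A minimal stand-in for the dialog helper, assuming it simply reports the summary to the user:

from tkinter import messagebox


def popupmsg(msg):
    # show the pre-process summary in a simple info dialog
    messagebox.showinfo("Hazus pre-process", msg)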
Example #6
    def make_vocabulary(self, train_x):
        for document_index, words_in_document in enumerate(train_x):
            words_in_document = np.asarray(process(words_in_document))
            unique_words, count_of_words = np.unique(words_in_document,
                                                     return_counts=True)
            self.vocabulary[document_index] = {}
            for i, word in enumerate(unique_words):
                self.vocabulary[document_index][word] = count_of_words[i]
            # (from a separate evaluation loop; max_group, y, correct,
            #  start_time, and test_y are defined outside this excerpt)
            if max_group == y:
                correct += 1

        # Timekeeping
        timed = int(time.time() - start_time)
        print("Evaluation finished in ", timed, "seconds.")

        # Accuracy
        accuracy = (correct / len(test_y)) * 100
        print("Accuracy:", accuracy)
Example #7
    7: ridge,
    8: stacking_linear,
    9: stacking_averaging,
    10: adaBoost,
    11: gradBoost,
}

print "Movie Lens Dataset"
print "Operations"
print "0. View Users Data"
print "1. View Movies Data"
print "2. View Ratings Data"
print "3. View Preprocessed Data"
print "4. View Correlation between different columns"
print "5. Create Train and Test Dataset"
print "6. Random Forest Regression for Movies"
print "7. Ridge Regression for Movie Ratings"
print "--------------------------------------"
print " Ensembling Techniques"
print "8. Linear Stacking For Movie Ratings"
print "9. Average Stacking for Movie Ratings"
print "10. AdaBoosting for Movie Ratings"
print "11. Gradient Boosting For Movie Ratings"

print "enter your choice of Operations"
choice = input()
if (choice >= 3):
    data_frame1 = pre_process.process()

options[choice]()
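The options dict maps each menu number to a handler function, so options[choice]() dispatches directly on the user's input. A small variant of the same pattern that guards against an invalid choice is sketched below; the handler name is a placeholder, not one of the functions above.

def view_users_data():
    print("users ...")


options = {0: view_users_data}

choice = int(input("enter your choice of Operations: "))
handler = options.get(choice)
if handler is None:
    print("Unknown option:", choice)
else:
    handler()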