def getDatasetsOfDatasetDocumentFromDatabase(
        dataset_document_name=DEFAULT_DATASET_DOCUMENT_NAME,
        db_name=DEFAULT_STATISTICS_DB_NAME):
    """
    Gets the given dataset document from the database.
    :param dataset_document_name - Name of the dataset to get from the database
    :param db_name: Name of the database to get the dataset from.
    :return datasets - List of lists
        [[DEFAULT_DATASET_LIST_INDEX_TRAINING], [DEFAULT_DATASET_LIST_INDEX_TEST]]
    """
    database = db.couch_database(db_name)
    document = database.getDocumentFromDatabase(dataset_document_name)
    datasets = []
    datasets.insert(DEFAULT_DATASET_LIST_INDEX_TRAINING,
                    document[DATASET_DOCUMENT_FIELD_NAME_TRAINING_SET])
    datasets.insert(DEFAULT_DATASET_LIST_INDEX_TEST,
                    document[DATASET_DOCUMENT_FIELD_NAME_TEST_SET])
    return datasets
def addWordDictionariesToDocumentsOfDatabase(db_name, fields):
    """
    Creates a word dictionary (word:occurrences) over the given fields for
    every document in the given db_name and stores it back in the document.
    :param db_name: The name of the database to add word dictionaries to
    :param fields: The fields of the documents to use for the word dictionary
    """
    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()
    pattern = preprocessor.getWhiteListPattern()
    for row in all_docs.rows:
        document = row.doc
        for field in fields:
            counts = Counter()
            #sadly view _all_docs also gives design docs, so we have to check
            if field in document.keys():
                doc_field = document[field]
                for word in doc_field.split():
                    counts.update(token.lower()
                                  for token in re.split(pattern, word))
                word_dictionary = dict(counts)
                new_field = getFieldForWordDictionaryByDocumentField(field)
                document[new_field] = word_dictionary
        database.updateDocumentInDatabase(document)
def cleanFieldsOfDocumentsFromDatabase(db_name, fields):
    """
    Replaces every match of the whitelist pattern with a space in the given
    fields of all documents in the given database.
    :param db_name - The db in which the fields should be cleaned
    :param fields - List of fields to clean
    """
    pattern = getWhiteListPattern()
    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()
    for row in all_docs.rows:
        document = row.doc
        for field in fields:
            #sadly view _all_docs also gives design docs, so we have to check
            if field in document.keys():
                doc_field = document[field]
                cleaned_doc_field = re.sub(pattern, ' ', doc_field)
                document[field] = cleaned_doc_field
        database.updateDocumentInDatabase(document)
def perform_train_test_split(db_name=ds.DEFAULT_DB_NAME,
                             train_size=ds.DEFAULT_TRAININGSET_SIZE):
    """
    Gets all document_ids of the given database and splits them according to
    the given train_size. The tricky part is that we need the tags of every
    document to stratify the split.
    :param db_name: Name of database to split documents (default DEFAULT_DB_NAME)
    :param train_size: Size in percentage [0,1] of the training set.
    :return splitted_dataset - List of lists
        [[DEFAULT_DATASET_LIST_INDEX_TRAINING], [DEFAULT_DATASET_LIST_INDEX_TEST]]
    """
    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()
    doc_ids_list = []
    all_tag_list = []
    i = 0
    for row in all_docs.rows:
        document = row.doc
        #append the document id to doc_ids_list
        doc_ids_list.append(document[cp.COUCHDB_DOCUMENT_FIELD_ID])
        tag_list = []
        #if the document has tags then split and add them
        if pp.STACKEXCHANGE_TAGS_COLUM in document.keys():
            document_tags = document[pp.STACKEXCHANGE_TAGS_COLUM]
            tags_list = document_tags.split(sep=dtm_provider.TAG_SPLIT_separator)
            for tag in tags_list:
                #strip the closing '>' (last character) of each tag
                tag_list.append(tag[:-1])
        #append the list of document tags to all_tag_list
        all_tag_list.append(tag_list)
        i += 1
        #only use the first 10000 documents to keep the split manageable
        if i > 10000:
            break
    mlb = MultiLabelBinarizer()
    tags_encoded = mlb.fit_transform(all_tag_list)
    print(len(doc_ids_list))
    splitted_dataset = cross_validation.train_test_split(
        doc_ids_list, tags_encoded, train_size=train_size,
        random_state=42, stratify=tags_encoded)
    return splitted_dataset
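#A minimal, self-contained sketch (illustrative only, not part of the
#pipeline) of what MultiLabelBinarizer does with tag lists like the ones
#collected in perform_train_test_split above; the sample tags are made up.
def _demoMultiLabelBinarizer():
    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    encoded = mlb.fit_transform([['python', 'couchdb'], ['python'], []])
    print(mlb.classes_)  #['couchdb' 'python']
    print(encoded)       #[[1 1] [0 1] [0 0]]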
def insertWordDictionaryIntoDatabase(db_name, document):
    """
    Inserts a word dictionary document into the db with the given name.
    :param db_name: The name of the database to insert the document into
    :param document: The document to insert into the database
    """
    database = db.couch_database(db_name)
    database.insertDocumentIntoDatabase(document)
def readJsonFromFileAndImportIntoDatabase(db_name, file):
    """
    Reads JSON from the given file and imports it into the given db_name.
    :param db_name: Name of the database to import the JSON content into
    :param file: File (incl. name and path) to import the JSON content from
    """
    database = db.couch_database(db_name)
    database.bulkInsertDocumentsFromFile(file)
def performFeatureSelection():
    """
    Replicates the cleaned database and adds the word dictionaries of the
    relevant fields to all of its documents.
    """
    #1.) replicate our current clean database to a new database
    database = db.couch_database(cp.COUCHDB_CLEANED_NAME)
    #2.) now we work with the new clean database
    db_name = cp.COUCHDB_CLEANED_WD_NAME
    database.replicateCouchDBToNewCouchDB(db_name)
    #3.) define the fields we want to create word dictionaries of
    fields = [pp.STACKEXCHANGE_TITLE_COLUMN, pp.STACKEXCHANGE_BODY_COLUMN]
    #4.) add the word dictionaries of the given fields to the documents
    wd.addWordDictionariesToDocumentsOfDatabase(db_name, fields)
def performPreprocessor():
    """
    Equivalent to the main method, but it needs to be callable separately.
    Performs all relevant preprocessing steps.
    """
    import_file = pp.RESOURCES_FOLDER + pp.STACKEXCHANGE_JSON_FILE
    readJsonFromFileAndImportIntoDatabase(cp.COUCHDB_RAW_NAME, import_file)
    importDesignDocumentIntoGivenDatabase(cp.COUCHDB_RAW_NAME)
    db_name = cp.COUCHDB_CLEAN_HTML_NAME
    FIELDS = [pp.STACKEXCHANGE_TITLE_COLUMN, pp.STACKEXCHANGE_BODY_COLUMN]
    database = db.couch_database(cp.COUCHDB_RAW_NAME)
    database.replicateCouchDBToNewCouchDB(db_name)
    cleanHTMLTagsFromDocumentsInDatabase(db_name, FIELDS)
    database = db.couch_database(db_name)
    database.replicateCouchDBToNewCouchDB(cp.COUCHDB_CLEANED_NAME)
    db_name = cp.COUCHDB_CLEANED_NAME
    cleanFieldsOfDocumentsFromDatabase(db_name, FIELDS)
def importDesignDocumentIntoGivenDatabase(dbname):
    """
    Imports the default design document into the given database.
    :param dbname: Name of the database to import the design document into
    """
    #path to the design document file
    design_document_file = pp.RESOURCES_FOLDER + pp.STACKEXCHANGE_DESIGN_DOCUMENT
    database = db.couch_database(dbname=dbname)
    stored_id = database.insertDesignDocumentFromGivenPath(design_document_file)
    print("ID of stored doc: " + str(stored_id))
def getDatasetContentDocumentFromDatabase(dataset_document_name, dataset_name,
                                          used_fields,
                                          doc_type=DEFAULT_DSCD_TYPE,
                                          db_name=DEFAULT_STATISTICS_DB_NAME):
    """
    Gets the required dataset content document from the database.
    :param dataset_document_name - The name of the dataset document the
        dataset content document was created of.
    :param dataset_name - The dataset to retrieve (training|test)
    :param used_fields - The fields used for creating the dataset content doc
    :param doc_type - Default set to DEFAULT_DSCD_TYPE
    :param db_name - Default set to DEFAULT_STATISTICS_DB_NAME
    :return dataset_content_document
    """
    database = db.couch_database(db_name)
    #we need all documents from the statistics db
    all_docs = database.getAllDocumentsFromDatabase()
    #we only need the one with the used fields
    for row in all_docs.rows:
        document = row.doc
        document_keys = document.keys()
        #check if all needed fields exist
        if DSCD_FIELD_DATASET_DOCUMENT_USED in document_keys \
                and DSCD_FIELD_TYPE in document_keys \
                and DSCD_FIELD_DATASET_NAME in document_keys \
                and DSCD_FIELD_USED_FIELDS in document_keys:
            #the document we want needs to have the matching
            #dataset_document_name, type, dataset_name and the correct
            #used fields
            if dataset_document_name == document[DSCD_FIELD_DATASET_DOCUMENT_USED] \
                    and doc_type == document[DSCD_FIELD_TYPE] \
                    and dataset_name == document[DSCD_FIELD_DATASET_NAME] \
                    and used_fields == document[DSCD_FIELD_USED_FIELDS]:
                return document
def perform_train_test_split(db_name=DEFAULT_DB_NAME,
                             train_size=DEFAULT_TRAININGSET_SIZE):
    """
    Gets all document_ids of the given database and splits them according to
    the given train_size.
    :param db_name: Name of database to split documents (default DEFAULT_DB_NAME)
    :param train_size: Size in percentage [0,1] of the training set.
    :return splitted_dataset - List of lists
        [[DEFAULT_DATASET_LIST_INDEX_TRAINING], [DEFAULT_DATASET_LIST_INDEX_TEST]]
    """
    database = db.couch_database(db_name)
    all_doc_ids = database.getAllDocumentIdsFromDatabase()
    splitted_dataset = cross_validation.train_test_split(
        all_doc_ids, train_size=train_size, random_state=42)
    return splitted_dataset
def cleanHTMLTagsFromDocumentsInDatabase(db_name, fields):
    """
    Finds and removes all (HTML) tags from the given fields of all documents
    in the given database.
    :param db_name - The db in which tags should be removed
    :param fields - List of fields from which HTML tags should be removed
    """
    #see source:
    #http://kevin.deldycke.com/2008/07/python-ultimate-regular-expression-to-catch-html-tags/
    ultimate_regexp = r"""(?i)<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>"""
    html_tag_pattern = re.compile(ultimate_regexp)
    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()
    for row in all_docs.rows:
        #print("ID of document: " + str(row.id))
        document = row.doc
        for field in fields:
            #sadly view _all_docs also gives design docs, so we have to check
            if field in document.keys():
                doc_field = document[field]
                if re.findall(html_tag_pattern, doc_field):
                    cleaned_doc_field = re.sub(html_tag_pattern, '', doc_field)
                    document[field] = cleaned_doc_field
        database.updateDocumentInDatabase(document)
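#Illustrative only: a self-contained demo of the "ultimate" HTML tag regexp
#used in cleanHTMLTagsFromDocumentsInDatabase above, applied to a made-up
#sample string instead of a database field.
def _demoHtmlTagRegexp():
    import re
    ultimate_regexp = r"""(?i)<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>"""
    html_tag_pattern = re.compile(ultimate_regexp)
    sample = "<p>Hello <a href='x'>world</a></p>"
    #removes the opening and closing tags, keeps the text
    print(re.sub(html_tag_pattern, '', sample))  #Hello world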
def getAllStoredWordDictionaryDocumentsFromDatabase(db_name):
    """
    Loads the word statistic documents from the given database and returns a
    list of word dictionaries.
    :param db_name: The database to load the word statistics from
    :return [list] of {word_dictionaries}
    """
    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()
    word_dictionary_list = []
    for row in all_docs.rows:
        document = row.doc
        #skip the design document
        if document[cp.COUCHDB_DOCUMENT_FIELD_ID] != cp.DESIGN_DOCUMENT_ID:
            word_dictionary_list.append(document)
    return word_dictionary_list
def insertSplittedDatasetToDatabase(dataset_name, splitted_dataset,
                                    db_name=DEFAULT_STATISTICS_DB_NAME):
    """
    Stores the given dataset with the given dataset_name in the given database.
    :param dataset_name - Name of the dataset to store
    :param splitted_dataset - The dataset to store as list of lists
        [[DEFAULT_DATASET_LIST_INDEX_TRAINING], [DEFAULT_DATASET_LIST_INDEX_TEST]]
    :param db_name: Name of the database to store the dataset in.
    """
    database = db.couch_database(db_name)
    dataset_document = {}
    dataset_document[DATASET_DOCUMENT_FIELD_NAME] = dataset_name
    dataset_document[DATASET_DOCUMENT_FIELD_TYPE] = DATASET_DOCUMENT_TYPE
    dataset_document[DATASET_DOCUMENT_FIELD_NAME_TRAINING_SET] = \
        splitted_dataset[DEFAULT_DATASET_LIST_INDEX_TRAINING]
    dataset_document[DATASET_DOCUMENT_FIELD_NAME_TEST_SET] = \
        splitted_dataset[DEFAULT_DATASET_LIST_INDEX_TEST]
    database.insertDocumentIntoDatabase(dataset_document)
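#A hedged end-to-end sketch: split the document ids, store the split, and
#read it back. Assumes a running CouchDB with the module defaults and that
#perform_train_test_split, insertSplittedDatasetToDatabase and
#getDatasetsOfDatasetDocumentFromDatabase are importable from the same
#module; 'my_split' is an illustrative name.
def _demoDatasetRoundTrip():
    splitted_dataset = perform_train_test_split()
    insertSplittedDatasetToDatabase('my_split', splitted_dataset)
    datasets = getDatasetsOfDatasetDocumentFromDatabase('my_split')
    print(len(datasets[DEFAULT_DATASET_LIST_INDEX_TRAINING]))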
def createWordDictionaryFromDatabaseContent(db_name, fields):
    """
    Creates a word dictionary (word:occurrences) from all documents in the
    given db_name over the given fields.
    :param db_name: The name of the database to create the word dictionary from
    :param fields: The fields of the documents to use for the word dictionary
    """
    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()
    counts = Counter()
    #TODO replace
    pattern = preprocessor.getWhiteListPattern()
    for row in all_docs.rows:
        document = row.doc
        for field in fields:
            #sadly view _all_docs also gives design docs, so we have to check
            if field in document.keys():
                doc_field = document[field]
                for word in doc_field.split():
                    counts.update(token.lower()
                                  for token in re.split(pattern, word))
    word_dictionary = dict(counts)
    return word_dictionary
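#Illustrative only: mimics the counting done in
#createWordDictionaryFromDatabaseContent on a made-up string. The pattern
#here is an assumption standing in for preprocessor.getWhiteListPattern(),
#whose real value may differ.
def _demoWordDictionary():
    import re
    from collections import Counter
    pattern = r'[^a-zA-Z0-9]'  #assumed stand-in for the whitelist pattern
    counts = Counter()
    for word in "Hello hello, world!".split():
        counts.update(token.lower() for token in re.split(pattern, word))
    #note the '' entries produced by trailing punctuation
    print(dict(counts))  #{'hello': 2, '': 2, 'world': 1}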
def insertDatasetContentDocumentInDatabase(dataset_document_used, dataset_name,
                                           used_fields, index_dictionary,
                                           document_content,
                                           tag_index_dictionary, document_tags,
                                           tags_list,
                                           db_name=DEFAULT_STATISTICS_DB_NAME):
    """
    Builds a dataset content document from the given parts and inserts it
    into the given database.
    """
    document = {}
    document[DSCD_FIELD_DATASET_DOCUMENT_USED] = dataset_document_used
    document[DSCD_FIELD_DATASET_NAME] = dataset_name
    document[DSCD_FIELD_USED_FIELDS] = used_fields
    document[DSCD_FIELD_TYPE] = DEFAULT_DSCD_TYPE
    document[DSCD_FIELD_INDEX_DICTIONARY] = index_dictionary
    document[DSCD_FIELD_CONTENT] = document_content
    document[DSCD_FIELD_TAG_INDEX_DICTIONARY] = tag_index_dictionary
    document[DSCD_FIELD_CONTENT_TAGS] = document_tags
    document[DSCD_FIELD_TAGS_LIST] = tags_list
    database = db.couch_database(db_name)
    document_id = database.insertDocumentIntoDatabase(document)
    print(document_id)
def buildDatasetContentListsOfDataset(dataset, document_fields):
    """
    Builds the dataset content lists of the given dataset. This is the
    structure we use later for the sklearn Vectorizer (for the document term
    matrix).
    :param dataset - Dataset to build the dataset content list from
    :param document_fields - List of the fields of a document to use
    :return index_dictionary - Dictionary{} key:docId, value:index in
        dataset_content
    :return dataset_content - The content (from the document fields
        #document_fields) of all documents as list
    :return tag_index_dictionary - Dictionary{} key:tag, value:index in
        dataset_tags
    :return dataset_content_tags - All tags of the given documents
    :return dataset_tags - List[] of all tags
    """
    database = db.couch_database(DEFAULT_DATABASE)
    all_docs = database.getDocumentsForGivenIds(dataset)
    #stores an item for the content of each document
    dataset_content = []
    dataset_content_tags = []
    #key=document_id, value=index in content list
    index_dictionary = {}
    #key=tag, value=index in tag list
    tag_index_dictionary = {}
    #list of tags
    dataset_tags = []
    for row in all_docs.rows:
        document = row.doc
        document_id = document.id
        document_content = ''
        #"sum" the text for a document over all fields
        #workaround to check that title and body exist
        if pp.STACKEXCHANGE_TITLE_COLUMN in document.keys():
            for doc_field in document_fields:
                if doc_field in document.keys():
                    document_content = document_content + document[doc_field] + ' - '
            dataset_content.append(document_content)
            index_dictionary[document_id] = len(dataset_content) - 1
            if pp.STACKEXCHANGE_TAGS_COLUM in document.keys():
                document_tags = document[pp.STACKEXCHANGE_TAGS_COLUM]
                dataset_content_tags.append(document_tags)
                tags_list = provide_tag_list_of_tag_content(document_tags)
                for tag in tags_list:
                    #tag does not exist yet, add it to the list and dictionary
                    if tag not in tag_index_dictionary.keys():
                        dataset_tags.append(tag)
                        tag_index_dictionary[tag] = len(dataset_tags) - 1
    return index_dictionary, dataset_content, tag_index_dictionary, \
        dataset_content_tags, dataset_tags
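#A minimal sketch of how a dataset_content list like the one built above
#could feed a sklearn vectorizer to get the document term matrix; the two
#sample documents are made up for illustration.
def _demoDocumentTermMatrix():
    from sklearn.feature_extraction.text import CountVectorizer
    dataset_content = ["title one - body one - ", "title two - body two - "]
    vectorizer = CountVectorizer()
    #sparse matrix of shape (n_documents, n_terms)
    dtm = vectorizer.fit_transform(dataset_content)
    print(vectorizer.vocabulary_)  #term -> column index
    print(dtm.toarray())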