Code example #1
def getDatasetsOfDatasetDocumentFromDatabase(
                            dataset_document_name=DEFAULT_DATASET_DOCUMENT_NAME, 
                            db_name=DEFAULT_STATISTICS_DB_NAME):
    
    """
    Gets the given dataset from database.
    
    :param dataset_document_name - Name of dataset to get from database
    :param db_name: Name of database to get dataset from.
    
    :return datasets - List of lists 
                    [[DEFAULT_DATASET_LIST_INDEX_TRAINING], 
                    [DEFAULT_DATASET_LIST_INDEX_TEST]]
    """
                                
    database = db.couch_database(db_name)
    
    document = database.getDocumentFromDatabase(dataset_document_name)
    
    datasets = []
    
    datasets.insert(DEFAULT_DATASET_LIST_INDEX_TRAINING, 
                   document[DATASET_DOCUMENT_FIELD_NAME_TRAINING_SET])
    datasets.insert(DEFAULT_DATASET_LIST_INDEX_TEST, 
                   document[DATASET_DOCUMENT_FIELD_NAME_TEST_SET])
    
    return datasets
Code example #2
def addWordDictionariesToDocumentsOfDatabase(db_name, fields):
    """
    Creates an word dictionary (word:occurences) from all documents in given
    db_name of given fields.
    
    :param db_name: The name of database to add word dictionaries
    :param fields: The fields of documents to use for word dictionary
    """

    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()
    pattern = preprocessor.getWhiteListPattern()

    for row in all_docs.rows:

        document = row.doc

        for field in fields:

            counts = Counter()

            #sadly view _all_docs also gives design docs, so we have to check
            if field in row.doc.keys():

                doc_field = document[field]

                for word in doc_field.split():
                    counts.update(token.lower()
                                  for token in re.split(pattern, word))

                word_dictionary = dict(counts)
                new_field = getFieldForWordDictionaryByDocumentField(field)
                document[new_field] = word_dictionary

        database.updateDocumentInDatabase(document)
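The word-dictionary step above boils down to splitting each field value on a whitelist pattern and counting the lowercased tokens with a Counter. A minimal, self-contained sketch of that counting logic (the simple pattern below is only an assumed stand-in for preprocessor.getWhiteListPattern()):

import re
from collections import Counter

#assumed stand-in for the whitelist pattern: split on anything that is not a letter or digit
pattern = re.compile(r"[^a-zA-Z0-9]+")

doc_field = "CouchDB stores JSON documents; CouchDB replicates them."

counts = Counter()
for word in doc_field.split():
    #lowercase every token produced by splitting the word on the pattern
    counts.update(token.lower() for token in re.split(pattern, word) if token)

word_dictionary = dict(counts)
print(word_dictionary)   #e.g. {'couchdb': 2, 'stores': 1, ...}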
Code example #3
def cleanFieldsOfDocumentsFromDatabase(db_name, fields):
    
    """
    Find and replace all (html-) tags from given fields in given database
    
    :param db_name - The db in which tags should be replaced
    :param fields - List fields from which html tags should be replaced
    
    """
    
    pattern = getWhiteListPattern()
    
    database = db.couch_database(db_name)
    
    all_docs = database.getAllDocumentsFromDatabase()

    for row in all_docs.rows:
        
        document = row.doc
        
        for field in fields:
            
            #sadly view _all_docs also gives design docs, so we have to check
            if field in row.doc.keys():
        
                doc_field = document[field]
                cleaned_doc_field = re.sub(pattern, ' ', doc_field)
                document[field] = cleaned_doc_field
            
        database.updateDocumentInDatabase(document)
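For the per-field cleanup above, everything happens in a single re.sub call that replaces each match of the whitelist pattern with a space. A self-contained sketch, again with an assumed stand-in pattern:

import re

#assumed stand-in pattern: everything that is not a letter, digit or space gets replaced
pattern = re.compile(r"[^a-zA-Z0-9 ]+")

doc_field = "C++ & Python: which one?"
cleaned_doc_field = re.sub(pattern, ' ', doc_field)
print(cleaned_doc_field)   #punctuation runs are replaced by single spaces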
def perform_train_test_split(db_name=ds.DEFAULT_DB_NAME,
                                        train_size=ds.DEFAULT_TRAININGSET_SIZE):
    
    """
    Get all document_ids of given database and split's it according to given
    train_size.
    The tricky part is that we n
    
    :param db_name: Name of database to split documents (default DEFAULT_DB_NAME)
    :param train_size: Size in percentage [0,1] of the training set.
    :return splitted_dataset - List of lists 
                    [[DEFAULT_DATASET_LIST_INDEX_TRAINING], 
                    [DEFAULT_DATASET_LIST_INDEX_TEST]]
    """
    
    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()
    
    doc_ids_list = []
    all_tag_list = []
    
    i = 0
    
    for row in all_docs.rows:
        
        document = row.doc
        #append the document id to doc_ids_list
        doc_ids_list.append(document[cp.COUCHDB_DOCUMENT_FIELD_ID])
        
        tag_list = []
        
        #if the document has tags then split and add them
        if pp.STACKEXCHANGE_TAGS_COLUM in document.keys():
            
            document_tags = document[pp.STACKEXCHANGE_TAGS_COLUM]
            
            tags_list = document_tags.split(sep=dtm_provider.TAG_SPLIT_separator)
            
            for tag in tags_list:
                
                #remove the closing tag character (the last character)
                tag_list.append(tag[:-1])
        #append the list of document tags to all_tag_list        
        all_tag_list.append(tag_list)
        
        i += 1
        
        #only process the first ~10000 documents
        if i > 10000:
            break
    
    mlb = MultiLabelBinarizer()
    tags_encoded = mlb.fit_transform(all_tag_list)

    
    print(len(doc_ids_list))
    
    splitted_dataset = cross_validation.train_test_split(doc_ids_list, tags_encoded,
                                               train_size=train_size, random_state=42,
                                               stratify=tags_encoded)
    
    return splitted_dataset
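For context, a self-contained toy version of the split above: binarize the per-document tag lists with MultiLabelBinarizer and stratify the train/test split on the encoded tags. Note that modern scikit-learn exposes train_test_split in sklearn.model_selection; the code above uses the older sklearn.cross_validation module.

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

#toy data: document ids and the tag list of each document (made up)
doc_ids_list = ["doc_%d" % i for i in range(10)]
all_tag_list = [["python", "couchdb"]] * 5 + [["regex"]] * 5

mlb = MultiLabelBinarizer()
tags_encoded = mlb.fit_transform(all_tag_list)   #binary indicator matrix

#stratify keeps the tag-combination proportions equal in both splits
ids_train, ids_test, tags_train, tags_test = train_test_split(
    doc_ids_list, tags_encoded, train_size=0.8, random_state=42,
    stratify=tags_encoded)

print(len(ids_train), len(ids_test))   #8 2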
Code example #6
def insertWordDictionaryIntoDatabase(db_name, document):
    """
    Inserts a word dictionary document into the database with the given name.
    
    :param db_name: The name of the database to insert the document into.
    :param document: The document to insert into the database.
    """

    database = db.couch_database(db_name)

    database.insertDocumentIntoDatabase(document)
Code example #7
def readJsonFromFileAndImportIntoDatabase(db_name, file):
    
    """
    Reads json from given file and imports it into given db_name
    
    :param db_name: Name of database to import json content of file to
    :param file: File incl. name and path to import json content from
    """
    
    database = db.couch_database(db_name)
    
    database.bulkInsertDocumentsFromFile(file)
Code example #9
def performFeatureSelection():
    
    """
    Performs the feature selection steps: replicates the cleaned database and
    adds word dictionaries of the selected fields to its documents.
    """
    
    #1.) replicate our current clean database to new database
    database = db.couch_database(cp.COUCHDB_CLEANED_NAME)
    
    #2.) Now we work with new clean database
    db_name = cp.COUCHDB_CLEANED_WD_NAME
    database.replicateCouchDBToNewCouchDB(db_name)
    
    #3.) Define the fields we want to create word dictionaries of
    fields = [pp.STACKEXCHANGE_TITLE_COLUMN, pp.STACKEXCHANGE_BODY_COLUMN]
    
    #4) Add the word dictionaries of given fields to documents
    wd.addWordDictionariesToDocumentsOfDatabase(db_name, fields)
Code example #11
def performPreprocessor():
    
    """
    Equivalent to the main method, but it needed to be called separate
    Performs all relevant preprocessing steps
    """
    import_file = pp.RESOURCES_FOLDER + pp.STACKEXCHANGE_JSON_FILE
    readJsonFromFileAndImportIntoDatabase(cp.COUCHDB_RAW_NAME,import_file)
    importDesignDocumentIntoGivenDatabase(cp.COUCHDB_RAW_NAME)
    
    db_name = cp.COUCHDB_CLEAN_HTML_NAME
    FIELDS = [pp.STACKEXCHANGE_TITLE_COLUMN, pp.STACKEXCHANGE_BODY_COLUMN]
    
    database = db.couch_database(cp.COUCHDB_RAW_NAME)
    database.replicateCouchDBToNewCouchDB(db_name)
    
    cleanHTMLTagsFromDocumentsInDatabase(db_name, FIELDS)
    
    database = db.couch_database(db_name)
    database.replicateCouchDBToNewCouchDB(cp.COUCHDB_CLEANED_NAME)
    
    db_name = cp.COUCHDB_CLEANED_NAME
    cleanFieldsOfDocumentsFromDatabase(db_name, FIELDS)
Code example #12
def importDesignDocumentIntoGivenDatabase(dbname):
    
    """
    Imports the default design document into given database
    
    :param db_name: Name of database to import design document to
    """
    
    #path to design_document_file
    design_document_file = pp.RESOURCES_FOLDER + pp.STACKEXCHANGE_DESIGN_DOCUMENT

    database = db.couch_database(dbname=dbname)
    
    stored_id = database.insertDesignDocumentFromGivenPath(design_document_file)
    
    print("ID of stored doc: " + str(stored_id))
def getDatasetContentDocumentFromDatabase(dataset_document_name,
                                          dataset_name,
                                          used_fields,
                                          doc_type=DEFAULT_DSCD_TYPE,
                                          db_name=DEFAULT_STATISTICS_DB_NAME):
    """
    
    Gets the required dataset content list from database
    
    :param dataset_document_name - The name of dataset_document the dataset
    content document was created of.
    :param dataset_name - The dataset to retrieve (training|test)
    :param used_fields - The used fields for creating the dataset content doc
    :parm doc_type - Default set to DEFAULT_DSCD_TYPE
    :param db_name - Default set to DEFAULT_STATISTICS_DB_NAME
    
    :return dataset_content_document
    
    """

    database = db.couch_database(db_name)
    #we need all documents from the statistics db
    all_docs = database.getAllDocumentsFromDatabase()

    #we only need the one with used fields

    for row in all_docs.rows:

        document = row.doc

        document_keys = document.keys()

        #check if all needed fields exist
        if DSCD_FIELD_DATASET_DOCUMENT_USED in document_keys \
            and DSCD_FIELD_TYPE in document_keys \
            and DSCD_FIELD_DATASET_NAME in document_keys \
            and DSCD_FIELD_USED_FIELDS in document_keys:

            #the document we want needs to have the matching
            #dataset_document_name type, dataset_name and the correct
            #used fields
            if  dataset_document_name == document[DSCD_FIELD_DATASET_DOCUMENT_USED]\
                and doc_type == document[DSCD_FIELD_TYPE] \
                and dataset_name == document[DSCD_FIELD_DATASET_NAME] \
                and used_fields == document[DSCD_FIELD_USED_FIELDS]:

                return document
Code example #14
def perform_train_test_split(db_name=DEFAULT_DB_NAME,
                                        train_size=DEFAULT_TRAININGSET_SIZE):
    
    """
    Get all document_ids of given database and split's it according to given
    train_size.
    
    :param db_name: Name of database to split documents (default DEFAULT_DB_NAME)
    :param train_size: Size in percentage [0,1] of the training set.
    :return splitted_dataset - List of lists 
    [[DEFAULT_DATASET_LIST_INDEX_TRAINING], [DEFAULT_DATASET_LIST_INDEX_TEST]]
    """
    
    database = db.couch_database(db_name)
    all_doc_ids = database.getAllDocumentIdsFromDatabase()
   
    splitted_dataset = cross_validation.train_test_split(all_doc_ids, 
                                               train_size=train_size,
                                               random_state=42)
    return splitted_dataset
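A quick, self-contained illustration of the returned splitted_dataset structure when train_test_split is called with a single sequence of ids (the ids below are made up; modern scikit-learn provides train_test_split in sklearn.model_selection):

from sklearn.model_selection import train_test_split

all_doc_ids = ["doc_%d" % i for i in range(10)]

#with one input sequence the result is [training_ids, test_ids]
splitted_dataset = train_test_split(all_doc_ids, train_size=0.8, random_state=42)

training_ids, test_ids = splitted_dataset
print(len(training_ids), len(test_ids))   #8 2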
Code example #15
def cleanHTMLTagsFromDocumentsInDatabase(db_name, fields):
    
    """
    Find and replace all (html-) tags from given fields in given database
    
    :param db_name - The db in which tags should be replaced
    :param fields - List fields from which html tags should be replaced
    
    """
    
    #see source: 
    #http://kevin.deldycke.com/2008/07/python-ultimate-regular-expression-to-catch-html-tags/
    ultimate_regexp = r"""(?i)<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>"""
    
    html_tag_pattern = re.compile(ultimate_regexp)
    
    database = db.couch_database(db_name)
    
    all_docs = database.getAllDocumentsFromDatabase()

    for row in all_docs.rows:
        
        #print("ID of document: " + str(row.id))
        
        document = row.doc
        
        for field in fields:
            
            #sadly view _all_docs also gives design docs, so we have to check
            if field in row.doc.keys():
        
                doc_field = document[field]
                
                if re.findall(html_tag_pattern, doc_field):
                    
                    cleaned_doc_field = re.sub(html_tag_pattern, '', doc_field)
                    document[field] = cleaned_doc_field
                    
        database.updateDocumentInDatabase(document)
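As a self-contained check of the tag-stripping regex above, applied to a made-up HTML snippet:

import re

#same pattern as above, as a raw string
ultimate_regexp = r"""(?i)<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[^'\">\s]+))?)+\s*|\s*)\/?>"""
html_tag_pattern = re.compile(ultimate_regexp)

doc_field = '<p>How do I <a href="https://example.org">split</a> strings?</p>'

if re.findall(html_tag_pattern, doc_field):
    print(re.sub(html_tag_pattern, '', doc_field))   #-> How do I split strings?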
Code example #16
def getAllStoredWordDictionaryDocumentsFromDatabase(db_name):
    """
    Loads the word statistic documents from given database and returns list
    of word dictionaries
    
    :param db_name: The database to load the word statistics from
    
    :return [list] of {word_dictionaries}
    """

    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()
    word_dictionary_list = []

    for row in all_docs.rows:

        document = row.doc

        #skip the design document
        if document[cp.COUCHDB_DOCUMENT_FIELD_ID] != cp.DESIGN_DOCUMENT_ID:
            word_dictionary_list.append(document)

    return word_dictionary_list
Code example #17
def insertSplittedDatasetToDatabase(dataset_name, splitted_dataset, 
                                db_name=DEFAULT_STATISTICS_DB_NAME):
    
    """
    Stores the given dataset with given dataset_name into given database.
    
    :param dataset_name - Name of dataset to store
    :param splitted_dataset - The dataset to store as list of lists 
                        [[DEFAULT_DATASET_LIST_INDEX_TRAINING][DEFAULT_DATASET_LIST_INDEX_TEST]]
    :param db_name: Name of database to split documents.
    """
    
    database = db.couch_database(db_name)
    
    dataset_document = {}
    dataset_document[DATASET_DOCUMENT_FIELD_NAME] = dataset_name
    dataset_document[DATASET_DOCUMENT_FIELD_TYPE] = DATASET_DOCUMENT_TYPE
    dataset_document[DATASET_DOCUMENT_FIELD_NAME_TRAINING_SET] = \
                                splitted_dataset[DEFAULT_DATASET_LIST_INDEX_TRAINING]
    dataset_document[DATASET_DOCUMENT_FIELD_NAME_TEST_SET] = \
                                splitted_dataset[DEFAULT_DATASET_LIST_INDEX_TEST]
                                        
    database.insertDocumentIntoDatabase(dataset_document)
Code example #19
def createWordDictionaryFromDatabaseContent(db_name, fields):
    """
    Creates an word dictionary (word:occurences) from all documents in given
    db_name of given fields.
    
    :param db_name: The name of database to create word dictionary from
    : param fields: The fields of documents to use for word dictionary
    """

    database = db.couch_database(db_name)

    all_docs = database.getAllDocumentsFromDatabase()

    counts = Counter()

    #TODO replace
    pattern = preprocessor.getWhiteListPattern()

    for row in all_docs.rows:

        document = row.doc

        for field in fields:

            #sadly view _all_docs also gives design docs, so we have to check
            if field in row.doc.keys():

                doc_field = document[field]

                for word in doc_field.split():
                    counts.update(token.lower()
                                  for token in re.split(pattern, word))

    word_dictionary = dict(counts)

    return word_dictionary
def insertDatasetContentDocumentInDatabase(dataset_document_used,
                                           dataset_name,
                                           used_fields,
                                           index_dictionary,
                                           document_content,
                                           tag_index_dictionary,
                                           document_tags,
                                           tags_list,
                                           db_name=DEFAULT_STATISTICS_DB_NAME):
    """
    Builds a dataset content document from the given parts and inserts it
    into the given database (db_name defaults to DEFAULT_STATISTICS_DB_NAME).
    """
    
    document = {}
    document[DSCD_FIELD_DATASET_DOCUMENT_USED] = dataset_document_used
    document[DSCD_FIELD_DATASET_NAME] = dataset_name
    document[DSCD_FIELD_USED_FIELDS] = used_fields
    document[DSCD_FIELD_TYPE] = DEFAULT_DSCD_TYPE
    document[DSCD_FIELD_INDEX_DICTIONARY] = index_dictionary
    document[DSCD_FIELD_CONTENT] = document_content
    document[DSCD_FIELD_TAG_INDEX_DICTIONARY] = tag_index_dictionary
    document[DSCD_FIELD_CONTENT_TAGS] = document_tags
    document[DSCD_FIELD_TAGS_LIST] = tags_list

    database = db.couch_database(db_name)
    doc_id = database.insertDocumentIntoDatabase(document)

    print(doc_id)
def buildDatasetContentListsOfDataset(dataset, document_fields):
    """
    Builds the dataset content list of given dataset. This is the structure we
    use later for the sklearn Vectorizer (for document term matrix)
    
    :param dataset - Dataset to build dataset content list from
    :parum document_fields - List of Lists [[]] for the fields of document 
    to use.
    
    :return index_dictionary - Dictionary{} key:docId, values:index in 
            dataset_content_list
    :return dataset_content - The content (from document fields #document_fields 
            of all documents as list
    :return tag_index_dictionary - Dictionary{] keys:tag, values: index in 
            taglist
    :return dataset_content_tags - All tags of given documents
    :return tags_list - List[] of all tags
    """

    database = db.couch_database(DEFAULT_DATABASE)
    all_doc_ids = database.getDocumentsForGivenIds(dataset)

    #stores an item for content of document
    dataset_content = []
    dataset_content_tags = []
    #key=document_id, value=index in content list
    index_dictionary = {}
    #key=tag, value=index in tag_list
    tag_index_dictionary = {}
    #list of tags
    dataset_tags = []

    for row in all_doc_ids.rows:

        document = row.doc
        document_id = document.id
        document_content = ''

        #"sum" the text for an document over all fields

        #workaround to check that title and body exist
        if pp.STACKEXCHANGE_TITLE_COLUMN in document.keys():

            for doc_field in document_fields:

                if doc_field in document.keys():

                    document_content = document_content + document[
                        doc_field] + ' - '

            dataset_content.append(document_content)
            index_dictionary[document_id] = len(dataset_content) - 1

        if pp.STACKEXCHANGE_TAGS_COLUM in document.keys():

            document_tags = document[pp.STACKEXCHANGE_TAGS_COLUM]

            dataset_content_tags.append(document_tags)

            tags_list = provide_tag_list_of_tag_content(document_tags)

            for tag in tags_list:
                # tag does not exist in dict, add it to list and dictionary
                if tag not in tag_index_dictionary.keys():
                    dataset_tags.append(tag)
                    tag_index_dictionary[tag] = len(dataset_tags) - 1

    return index_dictionary, dataset_content, tag_index_dictionary, \
        dataset_content_tags, dataset_tags
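The docstring above notes that the content list later feeds the sklearn Vectorizer. A minimal, self-contained sketch of that downstream step, assuming CountVectorizer as the vectorizer (the content strings and ids are made up):

from sklearn.feature_extraction.text import CountVectorizer

#what buildDatasetContentListsOfDataset produces: one concatenated string per
#document, plus an index from document id to position in the list
dataset_content = [
    "How to split a string - Use str.split ... - ",
    "Regex for html tags - Try re.sub ... - ",
]
index_dictionary = {"doc_1": 0, "doc_2": 1}

vectorizer = CountVectorizer()
document_term_matrix = vectorizer.fit_transform(dataset_content)

#one row per document, one column per term
print(document_term_matrix.shape)
print(vectorizer.vocabulary_)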