Example #1
0
def write_table(data_frame,
                table_name,
                sep=',',
                iotype='fs',
                remove_tmpfile=True):
    """Persist ``data_frame`` to the file system or to a database table.

    Args:
        data_frame: pandas DataFrame to write.
        table_name: key resolved through ``nnenv.getItem`` to obtain the
            target file name (``'fs'``) or table name (``'db'``).
        sep: field separator; used for the ``'fs'`` CSV output only.
        iotype: ``'fs'`` writes a CSV (without index) under
            ``nnenv.getResourcePath()``; ``'db'`` stages a header-less CSV in
            the configured ``tmp_dir`` and loads it with
            ``LOAD DATA LOCAL INPATH ... OVERWRITE INTO TABLE``.
        remove_tmpfile: when true, delete the staged CSV once the ``'db'``
            load attempt has finished (successfully or not). Ignored for
            ``'fs'``.

    Raises:
        ValueError: if ``iotype`` is neither ``'fs'`` nor ``'db'``.
    """
    ##reject unsupported modes before touching any resource
    if iotype not in ('fs', 'db'):
        raise ValueError('IOtype is only for db or fs')

    if iotype == 'fs':
        data_frame.to_csv(nnenv.getResourcePath() + nnenv.getItem(table_name),
                          sep=sep,
                          index=False)
        return

    ##local import: only the 'db' branch needs it
    import os

    ##initialize data engine
    engine = create_engine(nnenv.getConnectable())
    ##the resolved name is used for both the tmp file and the target table
    target = nnenv.getItem(table_name)
    path_tmp_file = nnenv.getItem('tmp_dir') + '/' + target
    try:
        ##write data frame to csv tmp file
        data_frame.to_csv(path_tmp_file, index=False, header=False)
        hive_sql_ = ('LOAD DATA LOCAL INPATH \'' + path_tmp_file +
                     '\' OVERWRITE INTO TABLE ' + target)
        ##connect database
        conn = engine.connect()
        try:
            ##execute hive_sql
            result = conn.execute(hive_sql_)
            result.close()
        finally:
            ##always hand the connection back, even when the load fails
            conn.close()
    finally:
        ##honour remove_tmpfile; the staged file may be missing if to_csv failed
        if remove_tmpfile and os.path.exists(path_tmp_file):
            os.remove(path_tmp_file)
Example #2
0
def main():
    """Rebuild the TF-IDF artefacts from the 'labeledContent' table.

    Reads the labelled corpus from the database, writes the content-id
    mapping CSV, and stores the vectorizer and TF-IDF matrix returned by
    ``createTfidfMatrix`` under ``nnenv.getResourcePath()``.
    """
    # inputs and output locations
    labeled_rows = nn.Dataframefactory('labeledContent',
                                       sep='|',
                                       iotype='db',
                                       con=nnenv.getItem('mysql_url'))
    vectorizer_file = "vectorizer.joblib"
    matrix_file = "tfidf.npy"
    resource_dir = nnenv.getResourcePath()

    # presumably loads the dictionary and stop words used by `segment`
    createDictStop()

    # merge title and content into the working corpus
    documents = combineTitleAndContent(labeled_rows)

    # persist the content_id column, with the row index labelled 'index'
    id_frame = documents[["content_id"]]
    id_frame.index.name = 'index'
    mapping_path = resource_dir + nnenv.getItem('content_id_mapping')
    id_frame.to_csv(mapping_path)

    # segment every document before vectorizing
    documents["corpus"] = documents["all"].apply(segment)

    # build the tf-idf matrix together with its vectorizer
    tfidf_values, fitted_vectorizer = createTfidfMatrix(documents)

    # save both artefacts next to the other resources
    with open(resource_dir + vectorizer_file, 'wb') as handle:
        joblib.dump(fitted_vectorizer, handle)

    np.save(resource_dir + matrix_file, tfidf_values)

    done_message = "new tfidf_matrix and vectorizer have been saved into {}"
    print(done_message.format(resource_dir))
Example #3
0
def Dataframefactory(table_name, sep=',', iotype='fs'):
    """Load a table as a pandas DataFrame from a CSV file or the database.

    Args:
        table_name: key resolved through ``nnenv.getItem`` to the CSV file
            name (``'fs'``) or the database table name (``'db'``).
        sep: field separator; used for ``'fs'`` only.
        iotype: ``'fs'`` reads a CSV under ``nnenv.getResourcePath()``;
            ``'db'`` reads the table through ``nnenv.getConnectable()``.

    Returns:
        The loaded pandas DataFrame.

    Raises:
        ValueError: if ``iotype`` is neither ``'fs'`` nor ``'db'``.
    """
    if iotype == 'fs':
        csv_path = nnenv.getResourcePath() + nnenv.getItem(table_name)
        return pd.read_csv(csv_path, sep=sep)
    if iotype == 'db':
        return pd.read_sql_table(table_name=nnenv.getItem(table_name),
                                 con=nnenv.getConnectable())
    ##carry the explanation in the exception instead of printing it
    raise ValueError('IOtype is only for db or fs')
Example #4
0
def Dataframefactory(table_name,
                     sep=',',
                     iotype='fs',
                     con=None):
    """Load a table as a pandas DataFrame from a CSV file or the database.

    Args:
        table_name: key resolved through ``nnenv.getItem`` to the CSV file
            name (``'fs'``) or the database table name (``'db'``).
        sep: field separator; used for ``'fs'`` only.
        iotype: ``'fs'`` reads a CSV under ``nnenv.getResourcePath()`` with
            the python parser engine; ``'db'`` reads the table via ``con``.
        con: connectable handed to ``pandas.read_sql_table``; used for
            ``'db'`` only. When ``None`` it falls back to
            ``nnenv.getConnectable()``.

    Returns:
        The loaded pandas DataFrame.

    Raises:
        ValueError: if ``iotype`` is neither ``'fs'`` nor ``'db'``.
    """
    if iotype == 'fs':
        csv_path = nnenv.getResourcePath() + nnenv.getItem(table_name)
        return pd.read_csv(csv_path, sep=sep, engine='python')
    if iotype == 'db':
        ##resolve the default lazily: a call in the signature would run once,
        ##at definition time, even when the caller supplies its own `con`
        if con is None:
            con = nnenv.getConnectable()
        return pd.read_sql_table(table_name=nnenv.getItem(table_name),
                                 con=con)
    ##fail loudly instead of silently returning None
    raise ValueError('IOtype is only for db or fs')
Example #5
0
def write_mysql_table(data_frame, table_name, con):
    """Replace the rows of a MySQL table with the rows of ``data_frame``.

    The table is truncated and then refilled with ``DataFrame.to_sql`` in
    append mode; the two steps are issued separately.

    Args:
        data_frame: pandas DataFrame whose rows are inserted (index omitted).
        table_name: key resolved through ``nnenv.getItem`` to the table name.
        con: key resolved through ``nnenv.getItem`` to the value passed to
            ``create_engine``.

    Returns:
        The status string ``'sucessful insert mysql table'``. The spelling is
        kept as-is because callers may compare against it.
    """
    table_name = nnenv.getItem(table_name)
    engine = create_engine(nnenv.getItem(con))
    connection = engine.connect()
    try:
        engine.execute('truncate table ' + table_name)

        data_frame.to_sql(name=table_name,
                          if_exists='append',
                          con=engine,
                          index=False)
    finally:
        # release the connection even when the truncate or the insert fails
        connection.close()
    return ('sucessful insert mysql table')
Example #6
0
    def loading_everything():
        """Fill the module-level tables, model objects and lookup lists.

        Every name declared ``global`` below is (re)assigned by this call;
        all data frames are read with ``iotype='fs'``.
        """
        global tag, similar, mapping
        global clf, tfidf_matrix
        global labeled_corpus, title_list, content_id_mapping

        createDictStop()

        # tag / similar tables and the mapping combined from them
        tag = nn.Dataframefactory('tag', iotype='fs')
        similar = nn.Dataframefactory('similar', iotype='fs')
        mapping = mappingCbind(similar, tag)

        # vectorizer and tf-idf matrix, located through their nnenv entries
        vectorizer_entry = nnenv.getItem('vectorizer')
        clf = nn.Joblibfactory(vectorizer_entry)
        tfidf_entry = nnenv.getItem('tfidf')
        tfidf_matrix = nn.Numpyarrayfactory(tfidf_entry)

        # labelled corpus and the list of its titles
        mysql_url = nnenv.getItem('mysql_url')
        labeled_corpus = nn.Dataframefactory('labeledContent',
                                             sep='|',
                                             iotype='fs',
                                             con=mysql_url)
        titles = labeled_corpus.title
        title_list = titles.tolist()

        content_id_mapping = nn.Dataframefactory('content_id_mapping',
                                                 iotype='fs')
Example #7
0
def write_table(data_frame, table_name, iotype='fs'):
    """Load ``data_frame`` into a database table through a staged CSV file.

    Only ``iotype='db'`` is implemented. The default ``'fs'`` is rejected, so
    callers must pass ``iotype='db'`` explicitly.

    Args:
        data_frame: pandas DataFrame to write (no header, no index).
        table_name: used verbatim as the staged file name and resolved
            through ``nnenv.getItem`` for the target table name.
        iotype: must be ``'db'``.

    Raises:
        ValueError: if ``iotype`` is anything other than ``'db'``.
    """
    if iotype != 'db':
        raise ValueError(
            "unsupported iotype %r: only 'db' is implemented" % (iotype,))

    ##get sqlalchemy data engine
    from sqlalchemy import create_engine
    ##initialize data engine
    engine = create_engine(nnenv.getConnectable())
    ##write data frame to csv tmp file
    ##NOTE(review): uses nnenv.getValue here but nnenv.getItem below - confirm
    path_tmp_file = nnenv.getValue('tmp_dir') + '/' + table_name
    data_frame.to_csv(path_tmp_file, index=False, header=False)
    ##build hive_sql before connecting so a lookup failure leaks nothing
    hive_sql_ = ('LOAD DATA LOCAL INPATH \'' + path_tmp_file +
                 '\' OVERWRITE INTO TABLE ' + nnenv.getItem(table_name))
    ##connect database
    conn = engine.connect()
    try:
        ##execute hive_sql
        result = conn.execute(hive_sql_)
        result.close()
    finally:
        ##always hand the connection back, even when the load fails
        conn.close()
Example #8
0
import pandas as pd
from datetime import datetime
import nnenv
import nndw
##nndw.write_mysql_table(df,table_name='iqvia_4pe_hcp_recommendation_new',con=nnenv.getItem('mysql_con'))

from sqlalchemy import create_engine

# Engine built from the 'mysql_con' entry of nnenv.
engine = create_engine(nnenv.getItem('mysql_con'))

# Open a connection up front; the returned object is not kept.
engine.connect()

# Print, in order: the row count, the column description and the index
# listing of the recommendation table.
inspection_statements = (
    'select count(1) from iqvia_4pe_hcp_recommendation_new',
    'desc iqvia_4pe_hcp_recommendation_new',
    'show index from iqvia_4pe_hcp_recommendation_new',
)
for statement in inspection_statements:
    for item in engine.execute(statement):
        print(item)

from sqlalchemy import create_engine
from sqlalchemy.engine import reflection
# Build a schema inspector on the engine and print every index entry that
# it reports for every table name it lists.
insp = reflection.Inspector.from_engine(engine)
for name in insp.get_table_names():
    for index in insp.get_indexes(name):
        print(index)

# Timestamp taken after the inspection; not used within the visible code.
t1 = datetime.now()