Esempi in Python per preprocessing, esempi in Python per preprocess.preprocessing

Esempio n. 1

0

Mostra file

File: pipeline.py Progetto: douglascook/bio_relex

def run_de_ting(start):
    """
    Run the pipeline so far starting at given stage
    """
    if start < 1 or start > 4:
        print 'Only stages between 1 and 4 exist!'
        return 0

    # take required inputs
    if start <= 2:
        no_orgs = input(
            'Should organisations be excluded (1 for yes, 0 for no)? ')

    if start <= 4:
        file_name = raw_input(
            'Enter a name for your WEKA file, .arff will be appended. ')

    # run de ting!
    if start == 1:
        preprocess.preprocessing()
        print 'STAGE ONE FINISHED'

    if start <= 2:
        entity_extraction.extract_all_entities(no_orgs)
        print 'STAGE TWO FINISHED'

    if start <= 3:
        feature_extraction.generate_feature_vectors()
        print 'STAGE THREE FINISHED'

    if start <= 4:
        weka.write_file(file_name)
        print 'STAGE FOUR FINISHED'

Esempio n. 2

0

Mostra file

def load_img(training_sample_size=1000,
             test_sample_size=200,
             x_size=18,
             y_size=5,
             all=False):
    """Load, preprocess and split the numbered images under ``./dataset``.

    Images are read from ``./dataset/<i>.png`` for
    ``i = 0 .. training_sample_size + test_sample_size - 1`` (image 0 is
    always read, even when the requested total is 0), each one is passed
    through ``preprocessing`` and the results are stacked into one array.
    Labels are the lines of ``./dataset/labels.txt`` with the trailing
    newline removed.

    Args:
        training_sample_size: number of leading samples used for training.
        test_sample_size: number of samples after the training ones used
            for testing.
        x_size: first component of the ``dsize`` passed to ``cv2.resize``.
        y_size: second component of the ``dsize`` passed to ``cv2.resize``.
        all: when truthy, skip the split/resize and return every image with
            its labels. (The name shadows the builtin; it is kept because
            callers may pass it by keyword.)

    Returns:
        ``(images, labels)`` when ``all`` is truthy, otherwise
        ``(X_train, y_train, X_test, y_test)`` where the X values are lists
        of flat arrays with ``x_size * y_size`` elements each.

    Note:
        The final reshape only succeeds when a resized image holds exactly
        ``x_size * y_size`` values, i.e. ``preprocessing`` is assumed to
        return single-channel images -- TODO confirm.
    """
    total = training_sample_size + test_sample_size

    # Context manager so the labels file is closed as soon as it is read.
    with open('./dataset/labels.txt') as labels_file:
        labels = [line.rstrip('\n') for line in labels_file]

    # Collect the frames in a list and stack once: np.insert inside the loop
    # copied the whole array for every image. max(..., 1) keeps the original
    # behaviour of always loading image 0.
    frames = [
        preprocessing(cv2.imread('./dataset/%d.png' % i))
        for i in range(max(total, 1))
    ]
    images = np.stack(frames)

    if all:
        return images, labels[:total]

    # test and training set
    train_images = images[:training_sample_size]
    y_train = np.squeeze(labels[:training_sample_size])
    test_images = images[training_sample_size:total]
    y_test = np.squeeze(labels[training_sample_size:total])

    def _resize_and_flatten(batch):
        """Resize every image to (x_size, y_size) and flatten it to 1-D."""
        resized = [cv2.resize(img, dsize=(x_size, y_size)) for img in batch]
        return [img.reshape(x_size * y_size) for img in resized]

    X_train = _resize_and_flatten(train_images)
    X_test = _resize_and_flatten(test_images)

    return X_train, y_train, X_test, y_test

Esempio n. 3

0

Mostra file

File: data_import.py Progetto: dss875914213/Action-detection-using-Kinect

    def dataImport(self):
        '''
        Load every available recording and assemble the skeleton/label arrays.

        Tries to read ``LSTM_Train/data/<i>/<j>.txt`` for i in 0..6 and
        j in 0..18. Files that are missing or cannot be parsed are skipped.
        Each loaded file is run through ``preprocessing(pos=...).run()`` and
        then ``self.add2List(skeleton, i)``; the results are appended to flat
        arrays which are finally reshaped.

        output: skeleton, label
            skeleton has shape [-1, self.n_steps, self.joints * 3]
            label has shape [-1, self.size]
        '''
        from preprocess import preprocessing

        all_skeleton = np.array([])
        all_label = np.array([])

        for i in range(7):
            for j in range(19):
                try:
                    # NOTE: this path is relative to the current working
                    # directory; update it whenever this file is moved.
                    rawData = np.loadtxt('LSTM_Train/data/{}/{}.txt'.format(
                        i, j))
                except (IOError, OSError, ValueError):
                    # Missing or unparsable file: skip this (i, j) pair.
                    # Only file/parse errors are ignored, so KeyboardInterrupt
                    # and genuine bugs are no longer silently swallowed.
                    continue

                _pre = preprocessing(pos=rawData)
                _skeleton = _pre.run()
                # i is forwarded to add2List alongside the skeleton;
                # presumably it is the class index of the recording --
                # verify against add2List.
                skeleton, label = self.add2List(_skeleton, i)
                # np.append without an axis flattens both operands, hence the
                # reshape after the loops.
                all_skeleton = np.append(all_skeleton, skeleton)
                all_label = np.append(all_label, label)

        all_label = np.reshape(all_label, [-1, self.size])
        all_skeleton = np.reshape(all_skeleton,
                                  [-1, self.n_steps, self.joints * 3])
        return all_skeleton, all_label

Esempio n. 4

0

Mostra file

def predict():
    """Classify the submitted news text and render the verdict.

    Reads the ``check_news`` form field, echoes it to stdout, converts it to
    model input with ``pp.preprocessing`` and feeds that to
    ``model1.predict``. A prediction that rounds to 0 is reported as true
    news, anything else as fake. The verdict is rendered into ``index.html``
    as ``prediction_text``.
    """
    submitted_text = str(request.form['check_news'])
    print(submitted_text)

    # Presumably pp.preprocessing covers the cleaning / one-hot / padding
    # steps that used to be written inline here -- verify against pp.
    model_input = pp.preprocessing(submitted_text)
    prediction = model1.predict(model_input)

    verdict = "News is true" if np.round(prediction) == 0 else "News is fake"
    return render_template("index.html", prediction_text=verdict)

Esempio n. 5

0

Mostra file

def reconst(imgPath, maskPath, moving, templatePath, preFlag):
    """Run the harmonization reconstruction steps for one image.

    Steps, in order:
      1. optionally run ``preprocessing`` on the image/mask pair
         (when ``preFlag`` is truthy) and continue with its outputs;
      2. compute RISH features with ``rish``;
      3. run ``antsReg`` unless the ``...1Warp.nii.gz`` file for this
         subject already exists, then ``antsApply`` the template;
      4. rebuild the signal with ``ring_masking`` and copy the input
         ``.bvec`` / ``.bval`` files next to the result;
      5. when the module-level ``debug`` flag is set, run ``dti_harm``.

    Returns:
        tuple ``(imgPath, maskPath, harmImg, harmMask)`` where the first two
        are the (possibly preprocessed) inputs and the last two come from
        ``ring_masking``.
    """
    if preFlag:
        imgPath, maskPath = preprocessing(imgPath, maskPath)

    img = load(imgPath)

    subject_dir = dirname(imgPath)
    # everything before the first '.nii' in the image path
    stem_path = imgPath.partition('.nii')[0]
    prefix = basename(stem_path)

    rish_prefix = os.path.join(subject_dir, 'harm', prefix)
    b0, shm_coeff, qb_model = rish(imgPath, maskPath, stem_path, rish_prefix,
                                   N_shm)

    print(f'Registering template FA to {imgPath} space ...')
    warp_prefix = pjoin(subject_dir, 'harm',
                        'ToSubjectSpace_' + prefix.replace(f'_b{bshell_b}',''))

    # Registration is skipped when its warp file is already on disk.
    warp_exists = isfile(warp_prefix+'1Warp.nii.gz')
    if not warp_exists:
        fixed = pjoin(subject_dir, 'dti', f'{prefix}_FA.nii.gz')
        antsReg(fixed, maskPath, moving, warp_prefix)

    antsApply(templatePath, pjoin(subject_dir, 'harm'), prefix)

    print(f'Reconstructing signal from {imgPath} rish features ...')
    harm_img, harm_mask = ring_masking(subject_dir, prefix, maskPath,
                                       shm_coeff, b0, qb_model, img.header)

    # The gradient files of the input are reused for the reconstructed image.
    for extension in ('.bvec', '.bval'):
        copyfile(stem_path + extension, harm_img.split('.nii')[0] + extension)

    if debug:
        dti_harm(harm_img, harm_mask)

    return (imgPath, maskPath, harm_img, harm_mask)

Esempio n. 6

0

Mostra file

File: main.py Progetto: ssgalitsky/NLP

def main():
    """Print the abstract, sentiment and keywords of each listed document.

    For every name in ``document_to_process`` the document is preprocessed,
    its keywords are computed with ``compute_tfidf`` against ``get_corpus()``
    and its sentiment with ``get_sentiment``; all three are printed.

    NOTE: ``document_to_process`` is an empty list here, so as written the
    function prints nothing. Returns None.
    """
    document_to_process = []
    for doc_name in document_to_process:
        abstract = preprocessing(doc_name)
        keywords = compute_tfidf(get_corpus(), abstract, KEYWORD_AMOUNT)
        sentiment = get_sentiment(abstract)

        for heading, content in (("\nAbstract:", abstract),
                                 ("\nSentiment:", sentiment)):
            print(heading)
            print(content)

        print("\nKeywords:")
        for term in keywords:
            print(term, keywords[term])
        print("==================")

Esempio n. 7

0

Mostra file

File: main.py Progetto: ngocphuong1809/underthesea

def text_classification():
    """Classify the text posted from the form and render the result.

    On POST the ``text_class`` form field is preprocessed, stripped of stop
    words, classified with ``nb_model`` and mapped back to a class name via
    ``label_encoder``. The result is appended to
    ``output/output_text_classification.csv`` and timing information is
    printed after every step. Every printed runtime is measured from the
    same ``start`` timestamp, so the figures are cumulative.

    For any other request method the page is rendered with
    ``classification=None``.
    """
    # Bound before the branch so a non-POST request renders the page instead
    # of raising UnboundLocalError at the return statement.
    class1 = None

    if request.method == 'POST':
        text = request.form[
            'text_class']  # input typed by the user in the form
        print(text + "\n")
        pre = preprocessing()
        start = time.time()

        # text preprocessing step
        document = pre.text_preprocess(text)
        end2 = time.time()
        print(f"Runtime of text preprocessing is {end2 - start}")

        # remove stop words from the preprocessed text
        document = pre.remove_stopwords(document)
        end3 = time.time()
        print(f"Runtime of removing stopwords is {end3 - start}")

        # predict the encoded label
        label = nb_model.predict([document])
        end4 = time.time()
        print(f"Runtime of prediction is {end4 - start}")

        # map the encoded label back to its class name
        class1 = label_encoder.inverse_transform(label)
        end5 = time.time()
        print(f"Runtime of inverse_transform is {end5 - start}")
        print(f"Predict label: {class1}")

        # append the result to the CSV log
        with open("output/output_text_classification.csv",
                  'a',
                  newline='',
                  encoding="utf8",
                  errors="ignore") as out:
            filenames = ['class', 'content']
            writer = csv.DictWriter(out, filenames)
            # The file is opened in append mode: only emit the header when
            # the file is still empty, otherwise every request would add
            # another header row in the middle of the data.
            if out.tell() == 0:
                writer.writeheader()
            writer.writerow({'class': class1, 'content': text.strip("\n")})
        end6 = time.time()
        print(f"Runtime of saving into cs file is {end6 - start}")

    return render_template('home.html', classification=class1)

Esempio n. 8

0

Mostra file

def perform_ocr(raw_image, model_dict):
    """Recognise the digits in ``raw_image`` using every model in ``model_dict``.

    The ``output_images`` folder next to this file is emptied first. The
    image is then rescaled to a height of 100 px (width follows the original
    aspect ratio), preprocessed and segmented into digits. Each digit is
    predicted by every model; ``get_best_digit`` picks one answer per digit
    from those predictions.

    Args:
        raw_image: image array; only ``shape[0]`` (height) and ``shape[1]``
            (width) are read here before resizing.
        model_dict: mapping whose values provide ``'model'``, ``'graph'``
            and ``'session'``. Its keys are used as indices into a list of
            ``len(model_dict)`` result strings, so they must be valid list
            indices for that length.

    Returns:
        The concatenation of the chosen digit for every segment. The
        per-model result strings and the final result are also printed.
    """
    # Delete everything currently in the output folder
    output_dir = os.path.dirname(os.path.abspath(__file__)) + "/output_images"
    for stale_name in os.listdir(output_dir):
        os.remove(output_dir + "/" + stale_name)

    # Resize the selected region of interest to a height of 100 px; the
    # width depends on the original resolution.
    source_height, source_width = raw_image.shape[0], raw_image.shape[1]
    aspect_ratio = source_width / (source_height * 1.0)
    target_height = 100
    target_width = int(target_height * aspect_ratio)
    raw_image = cv2.resize(raw_image, (target_width, target_height))

    # Apply the image preprocessing
    preprocessed_image = preprocessing(raw_image)

    # Segment the digits (the original comment states they are also resized
    # to keep their aspect ratio)
    all_digits = segmentation(preprocessed_image)

    all_results = [""] * len(model_dict)
    best_result = ""
    for digit in all_digits:
        votes = []
        for key, value in model_dict.items():
            predicted = str(
                predict_image(digit, value['model'], value['graph'],
                              value['session']))
            all_results[key] += predicted
            votes.append(predicted)
        # Per the original comment: of all the predictions, the digit
        # predicted most often is assumed to be the correct one.
        best_result += get_best_digit(votes)

    print(all_results)
    print(best_result)
    return best_result

Esempio n. 9

0

Mostra file

File: run_preprocess.py Progetto: amitashnanda/Plant-Sustainability-Enhancement-Using-UAV

import cv2
import os
import preprocess

# Folder that holds the raw input images
root_path = "raw_leaf_data/"

all_image_files = preprocess.get_all_files(root_path)

# Preprocess every image and store it as new_leaf_data/<class>/leaf-image<idx>.jpg
for idx, f in enumerate(all_image_files):
    t = preprocess.preprocessing(f)
    # The class name is the second '/'-separated component of whatever
    # precedes the first backslash in the path.
    target_class = f.split('\\')[0].split('/')[1]
    print(target_class)

    output_path = os.path.join('new_leaf_data', target_class,
                               "leaf-image" + str(idx)) + '.jpg'
    cv2.imwrite(output_path, t)
    print('saved', output_path)

Esempio n. 10

0

Mostra file

def multi_process(input_dir):
    """Print and preprocess every entry matched by ``<input_dir>/*``.

    Entries are handled one after another in the order ``glob.glob``
    returns them. Returns None.
    """
    pattern = os.path.join(input_dir, '*')
    for entry in glob.glob(pattern):
        print(entry)
        preprocessing(entry)

Esempio n. 11

0

Mostra file

                                              use_normalized_coordinates=True,
                                              line_thickness=5,
                                              min_score_thresh=0.50)

    #if any objects are detected, generate the temp file
    if (coordinates):
        generate_temp(coordinates)

    dim = (950, 1000)
    resized = cv2.resize(image_np, dim, interpolation=cv2.INTER_AREA)
    cv2.imshow('output_image', resized)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    #the image has been processed
    #delete the temporary processes file if exists
    path = '/home/mohak/Music/implementation/'
    os.remove(path + 'temp.jpg')
    #run the generated html code in browser as a webpage
    #give the complete path to the generated html file
    file_path = 'PATH_TO_DIR/generated_code.html'
    new = 2
    webbrowser.open(file_path, new=new)


# Driver for running on a local test image.
# `path` must be replaced with the full path of the image from which the
# HTML code is to be generated; the value below is only a placeholder.
path = 'COMPLETE_PATH_TO_INPUT_IMAGE'

# Preprocess the image, then hand the result to processImage.
processed_image = preprocessing(path)
processImage(processed_image)

Esempio n. 12

0

Mostra file

File: preprocess_chapters.py Progetto: JamesBrowns/data-analysis

import preprocess
import os

chapter_folder = "chapters"  # Folder to save result
chapter_split_mark = "$"  # Marker substituted for every chapter heading

# Create result folder
if not os.path.exists(chapter_folder):
    os.makedirs(chapter_folder)

# Split chapters: every chapter heading is replaced by the split mark and the
# text is then cut at those marks.
string = preprocess.input_file.read()
string = preprocess.str_replace_re(string, "正文 第.{1,5}回", chapter_split_mark)
chapters = string.split(chapter_split_mark)

# Save chapters as <chapter_folder>/<n>.txt
for chapter_no, chapter_string in enumerate(chapters):
    # Piece 0 is whatever precedes the first chapter heading; it is not saved.
    if chapter_no == 0:
        continue

    result = preprocess.preprocessing(chapter_string)

    file_name = os.path.join(chapter_folder, "%d.txt" % chapter_no)
    # Context manager so each chapter file is flushed and closed right away
    # instead of leaving one open handle per chapter.
    with open(file_name, "w") as chapter_file:
        chapter_file.write(result)

Esempio n. 13

0

Mostra file

File: model.py Progetto: Tanay-Mahajan/FakeNewsDetection

# NOTE(review): `x` is created earlier in the file (not visible here);
# presumably a pandas DataFrame with 'label', 'title' and 'text' columns.
# The 'label' column becomes the prediction target.
y= x['label']
# Remove the target column from the feature table
x= x.drop('label',axis=1)

# Combined title + text column. Note that the loop below only reads
# x['title'], not x['content'].
x['content'] = x['title'] + x['text']
x['content']

# Preprocess and clean every title; the results are collected in `corpus`.
# The earlier inline cleaning (U.S. -> USA replacement, regex filtering,
# lower-casing, lemmatization and stop-word removal) is no longer done here;
# pp.preprocessing is called instead.

corpus = []

for i in range(len(x['title'])):
    # x['title'][i] looks rows up by index label, so this relies on the
    # index being 0..n-1 -- TODO confirm.
    op = pp.preprocessing(x['title'][i])
    corpus.append(op)
    # Progress output, one line per processed row
    print(i, " row completed.")

Esempio n. 14

0

Mostra file

File: ibmModel1.py Progetto: nguyenchithien/Word-Based-Model

def main():
    if len(sys.argv)!= 5:                                                                               #check arguments
        print "Usage :: python ibmModel1.py file_source file_target iterations numberOfSentencesForTraining"
        sys.exit(0)
    
    numberOfSentences=int(sys.argv[4])                                                                  #initialisation        
    numberOfIterations = int(sys.argv[3])
    sentencePair = preprocessing(numberOfSentences, sys.argv[1], sys.argv[2] )

    listOfSourceWords = defaultdict(list)                                                               #create list of possible source words for a target word
    for pair in sentencePair:
        sourceWords=pair.split(' ')
        targetWords=sentencePair[pair].split(' ')
        for word in targetWords:
            if word!='':
                for key in sourceWords:
                    if key!='':
                        listOfSourceWords[word].append(key)
                        
    for word in  listOfSourceWords:
       listOfSourceWords[word] = list(set(listOfSourceWords[word]))
        

    translationProbability=defaultdict(dict)                                                            #initialize the translation probability
    for wordTarget in  listOfSourceWords:
        uniqueWordsSource = listOfSourceWords[wordTarget]
        for wordSource in uniqueWordsSource:
            translationProbability[wordTarget][wordSource]=1/float(len(uniqueWordsSource))
        
        
            
    perplexity=0                                                                                        #Expectation Maximisation
    iteration=0
    while iteration<numberOfIterations:
        iteration+=1 
        print "Iteration: " +str(iteration)
        
        count=defaultdict(lambda:  defaultdict(float))                                                  #initialisation
        sumTotal=defaultdict(float)
        total=defaultdict(float)
            
        for pair in sentencePair:                                                                       #E Step
            sourceWords=pair.split(' ')
            targetWords=sentencePair[pair].split(' ')
            for words in targetWords:
                if words!='':
                    for key in sourceWords:
                        if key!='':
                            sumTotal[words]+=translationProbability[words][key]

            for words in targetWords:
                if words!='':
                    for key in sourceWords:
                        if key!='':
                            count[words][key]+=(translationProbability[words][key]/sumTotal[words])
                            total[key]+=(translationProbability[words][key]/sumTotal[words])
          
        for key in sourceWords:                                                                         #M Step
            if key!='':
                for words in targetWords:
                    if words!='':
                        translationProbability[words][key]=count[words][key]/total[key]

        newperplexity=calculatePerplexity(sentencePair, translationProbability)
        print newperplexity
        if perplexity>=newperplexity:
            print "Successful"
        else:
            print "Failed"
        perplexity=newperplexity

    data=[]
    print len(translationProbability)
    for key in translationProbability:
        possibleWords = translationProbability[key]
        data.append(key+' '+max(possibleWords.iteritems(), key=operator.itemgetter(1))[0])
            
    with open('wordTranslation.txt','w') as f:
        f.write('\n'.join(data))

Esempio n. 15

0

Mostra file

File: cross_section.py Progetto: IDAEA-EVS/Geopropy

def main_function(database_dir,
                  bore_IDs,
                  Lithology_table,
                  box_bottom_rate=1.1,
                  bottomlength=15,
                  predefined_angle_degree=None,
                  Merge_Layers=False,
                  bottom_box_type='normalbottombox',
                  xshifter=0.5,
                  yshifter=0.5,
                  epsbn_ratio=0.05,
                  eps_ratio=0.01,
                  ExtendLine_edit_distance=5,
                  TrimLine_edit_dangle_length=2,
                  Integrate_management_distance=0.01,
                  del_x=10,
                  del_y=10,
                  smooth_2d=False,
                  gen_polygons=True):
    """Run the whole cross-section pipeline, printing progress and timings.

    The stages run strictly in the order below; each one consumes the
    outputs of the previous ones: pre-processing, reading the input tables,
    layer merging, plane calculation / surface projection / box and
    topography creation, point extraction, surface point processing,
    critical zone processing, definite lines, angle calculation, automatic
    structure completion, ultimate bottom boundary, 3D point and polyline
    feature classes and, when ``gen_polygons`` is true, the 3D-to-2D
    projection.

    Parameters (described by where this function forwards them):
        database_dir: passed to ``pypyodbc.win_connect_mdb`` and assigned to
            ``arcpy.env.workspace``.
        bore_IDs: passed to ``preprocess.preprocessing``.
        Lithology_table: passed to ``readinginputdata.readBorehole_LithoTable``.
        box_bottom_rate, bottom_box_type: passed to ``boxcreator.boxcreator2``.
        bottomlength: passed to ``postbottombox.postbottombox``.
        predefined_angle_degree: passed to ``anglefinder.anglefinder``.
        Merge_Layers: passed to ``layermerger.layermerge``, which is called
            in either case; the flag also controls two progress messages.
        xshifter, yshifter: passed to ``readinginputdata.readBorehole_table``.
        del_x, del_y: passed to ``readinginputdata.readtopo_points``.
        epsbn_ratio, eps_ratio, ExtendLine_edit_distance,
        TrimLine_edit_dangle_length, Integrate_management_distance,
        smooth_2d: passed to ``threeD_to_2d_projector_v2.cs_3d_to_2d``.
        gen_polygons: when False, a message is printed and ``sys.exit()`` is
            called after the 3D point/polyline step, which raises SystemExit
            instead of returning to the caller.

    Returns:
        None. The intermediate results are not returned (see the
        commented-out return statements at the end).
    """
    t_total_start = time.time()
    con_to_mdb = pypyodbc.win_connect_mdb(database_dir)
    arcpy.env.workspace = database_dir
    #arcpy.env.outputZFlag = "Enabled"
    ########################################################################################################
    '''Pre processing'''
    print "######### Pre-processing... #########"
    t0 = time.time()
    bore, indextemplist, mainpolylist = preprocess.preprocessing(bore_IDs)
    t1 = time.time()
    print "########## Pre-processing finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    print "######### Reading data... #########"
    t0 = time.time()
    '''reading Borehole_table'''
    boidtable, x, y, elev, indextemplist_with_coords = readinginputdata.readBorehole_table(
        bore, xshifter, yshifter, indextemplist, con_to_mdb)
    '''reading Borehole_Litho table'''
    boid, top, bot, lit, fault_points_in_rawdata = readinginputdata.readBorehole_LithoTable(
        bore, boidtable, elev, Lithology_table, con_to_mdb)
    '''Read and pre - processing priority table '''
    prior = readinginputdata.readpriority_table(con_to_mdb)
    #format = [prior 0:priority 1:[toplayer] 2:[bottomlayer] 3:type]
    #top layer and bottom layer type is list
    '''reading fault_table'''
    fault_table = readinginputdata.readfault_table(prior, con_to_mdb)
    #fault_table=[ [priority_number,[ [bhid,elevation] , [bhid,elevation] , ...] ]
    #, [priority_number,[ [bhid,elevation] , [bhid,elevation] , ...] , ... ] , ...]
    '''reading surface data'''
    temp_points = readinginputdata.readtopo_points(prior,
                                                   indextemplist_with_coords,
                                                   del_x, del_y, con_to_mdb)
    # All reads are done; the database connection is not used after this point.
    con_to_mdb.close()
    t1 = time.time()
    print "######### Reading data finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    '''merging the layers with the same Lithology and change the raw data structure'''
    # layermerge is always called; Merge_Layers only switches the merge
    # itself and the surrounding progress messages.
    if Merge_Layers == True:
        print "######### Merging layers... #########"
    rawdata = layermerger.layermerge(
        boid, top, bot, lit, indextemplist_with_coords, bore,
        Merge_Layers)  #don't do the merge part when it is false
    if Merge_Layers == True:
        print "######### Merging layers finished #########"
    #########################################################################################################
    print "######### 3D plane calculator, Projecting surface points, Generating Topography and guide bottom boundary... #########"
    t0 = time.time()
    '''plane calculator'''
    planenormalslist = plane_calculator_topo_projector.cross_product_and_planenormalslist_maker(
        indextemplist_with_coords)
    '''Projecting surface points'''
    temp_points = plane_calculator_topo_projector.surface_point_projector(
        planenormalslist, temp_points)
    ''' drawing the Box '''
    #ratiobottombox or normalbottombox:
    #normalbottombox find the minimum, and increase the box by a user defined ratio (consider faults)
    #ratiobottombox do it based on minimum of every borehole (consider faults)
    mainpolylist, boxpoints, rawdata = boxcreator.boxcreator2(
        bottom_box_type, box_bottom_rate, fault_table, rawdata, prior,
        mainpolylist, indextemplist_with_coords)
    ''' drawing the topo'''
    mainpolylist, maxfirst, maxlast, temp_points = topolinecreator.topolinecreator(
        mainpolylist, indextemplist_with_coords, prior, temp_points)
    # NOTE: this stage stores its end time in t2 (the other stages use t1).
    t2 = time.time()
    print "######### 3D plane calculator, Projecting surface points, Generating Topography and guide bottom boundary finished #########"
    print "Time:", t2 - t0
    print "\n \n \n"
    ########################################################################################################
    '''point extraction (normal and fault poin extraction)'''
    print "######### Point extraction... #########"
    t0 = time.time()
    mainpointlist, mainpointlistreverse, point_id = pointextraction.pointextraction(
        rawdata, prior, indextemplist_with_coords, x, y, boidtable,
        fault_table, elev)
    #mainpointlist=[0=point_id, 1=index, 2=bhid, 3=priority, 4=type, 5=coordinates,
    #6=connectedleft point id, 7=connectedright point id, 8=pointcode ]
    #pointcodes:
    #   0 = not connected
    #   1 = just left connected
    #   2 = just right connected
    #   3 = fully connected
    t1 = time.time()
    print "######### Point extraction finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    #################################################################
    '''topography reading data and processing'''
    print "####topography reading data and processing####"
    t0 = time.time()
    #to read and preprocess the surface data. Also to produce the maintopolist
    mainpointlist, mainpointlistreverse, mainpolylist, maintopolist = surface_pnt_process(
        mainpointlist, mainpointlistreverse, mainpolylist, prior, rawdata,
        temp_points, indextemplist_with_coords)
    t1 = time.time()
    print "####topography reading data and processing finished####"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    '''Manual stage'''
    print "######### Critical zones processing... #########"
    t0 = time.time()
    mainpointlist, mainpolylist = foldfaulintrusionsitunationdeterminer.foldfaulintrusionsitunationdeterminers(
        mainpointlist, mainpointlistreverse, mainpolylist, prior,
        indextemplist_with_coords, maintopolist)
    #IMPORTANT: output mainpointlist contains mainpointlistreverse also, the point situation for all points changed back
    t1 = time.time()
    print "######### Critical zones processing finish #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    '''create definite lines'''
    print "######### Generating automatic definite lines... #########"
    t0 = time.time()
    temres = definitelines.definitelines(prior, indextemplist_with_coords,
                                         mainpointlist, mainpolylist)
    # definitelines returns the updated point list first, then the polyline list
    mainpointlist = temres[0]
    mainpolylist = temres[1]
    t1 = time.time()
    print "######### Generating automatic definite lines finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    '''calculate angles'''
    print "######### Angle processing... #########"
    t0 = time.time()
    angles = anglefinder.anglefinder(mainpolylist, prior, fault_table,
                                     predefined_angle_degree)
    #angles= [ [prio_num,type, tan_angle, quantity, [ [index1,index2,startpoint,endpoint] ]   ] ,...]
    t1 = time.time()
    print "######### Angle processing finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    '''create lines in stage 2 '''
    print "######### Automatic structure completion... #########"
    t0 = time.time()
    mainpointlist, mainpolylist = secondstagelinecompleter.secondstagelinecompleter(
        mainpointlist, prior, mainpolylist, rawdata, angles,
        indextemplist_with_coords)
    t1 = time.time()
    print "######### Automatic structure completion finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    '''post bottom box'''
    print "######### Ultimate bottom boundary... #########"
    t0 = time.time()
    intersectionpoints, postbottomboxlist, minsfirst, minslast = postbottombox.postbottombox(
        mainpolylist, mainpointlist, indextemplist_with_coords, bottomlength)
    t1 = time.time()
    print "######### Ultimate bottom boundary finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    '''Create point and polylines in 3d'''
    print "######### Creating 3D point and layers (polylines) database... #########"
    t0 = time.time()
    pfcadd = createpointfeatureclass.createpointfeatureclass(
        boxpoints, mainpointlist, intersectionpoints, postbottomboxlist,
        indextemplist_with_coords[-1][1])
    arcgistempdb = createpolyfeatureclass.createpolyfeatureclass(
        mainpolylist, pfcadd, postbottomboxlist, minsfirst, minslast, maxfirst,
        maxlast, prior)
    t1 = time.time()
    print "######### Creating 3D point and layers (polylines) database finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    #This is to stop the algorithm in case the user doesn't want the polygons
    #NOTE: sys.exit() raises SystemExit, so the "Total time" report below is
    #skipped and a calling script stops as well unless it catches SystemExit.
    if gen_polygons == False:
        print "'gen_polygons' variable is False. In case of need to 3D polygons and/or 2D lines and polygons, set 'gen_polygons' to True"
        sys.exit()
    ########################################################################################################
    '''This function make the 3d polygons between boreholes individually, then merge them'''
    print "######### 3D to 2D Convertion, polygon determiner, 2D cross-section maker, 3D polygon database generator... #########"
    t0 = time.time()
    #epsbn_ratio=0.05
    #eps_ratio=0.01
    arcgistempdb_2d, polygns_3d = threeD_to_2d_projector_v2.cs_3d_to_2d(
        planenormalslist, mainpolylist, indextemplist_with_coords,
        postbottomboxlist, minsfirst, minslast, maxfirst, maxlast, prior,
        rawdata, epsbn_ratio, eps_ratio, ExtendLine_edit_distance,
        TrimLine_edit_dangle_length, Integrate_management_distance, smooth_2d)
    t1 = time.time()
    print "######### 3D to 2D Convertion, polygon determiner, 2D cross-section maker, 3D polygon database generator finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    print "Total time:"
    t_total_end = time.time()
    print t_total_end - t_total_start
    print "\n \n \n"
    ########################################################################################################
    #Earlier variants of the return value, kept for reference:
    #return pfcadd,arcgistempdb, mainpolylist,mainpointlist,indextemplist,prior,fault_table,rawdata,angles,arcgistempdb_2d
    #return arcgistempdb,arcgistempdb_2d, mainpolylist,mainpointlist,indextemplist,prior,fault_table,rawdata,angles
    #return pfcadd,arcgistempdb, mainpolylist,mainpointlist,indextemplist,prior,fault_table,rawdata
    return

Esempio n. 16

0

Mostra file

File: algo.py Progetto: sahitesh/Luminate

def algo(exp_data, attr_data_, map_data, pop, n, prop1, prop2, propensity1,
         propensity2, sessionId, userId, cols):
    """Segment ``exp_data`` into ``n`` clusters, profile them and dump the
    results into the MySQL ``*_TMP`` tables.

    Steps, in order (each one prints its elapsed time):

    1. Pick the split-flag column from ``exp_data['Market_flg']``: value 0
       selects the client-specific path (``SPLIT_FLG``, propensity scoring
       enabled); anything else selects ``SPLIT_FLG_NATIONAL_FILE`` and
       skips every propensity-related step.
    2. Restrict ``attr_data_`` to attributes present in ``exp_data`` and
       derive the factor / numeric / integer / binned column lists.
    3. Clean the data with ``clean`` (defined elsewhere).
    4. Client path only: load ``model_imp.pkl`` and, when available, the
       per-client models ``<client>_<prop1>.pkl`` / ``<client>_<prop2>.pkl``
       and score every row with ``predict_proba``.
    5. Run ``preprocessing`` and ``kmeans`` (both defined elsewhere), label
       the clusters 'A', 'B', ... and profile them with ``profile``.
    6. Client path only: bucket each propensity score into 1 (< .25),
       2 (>= .25 and < .7) or 3 (>= .7), build sub-segment labels such as
       'A1', and profile those as well.
    7. Insert all results through one MySQL connection and commit.

    Parameters:
        exp_data: DataFrame with a 'Market_flg' column and, on the client
            path, a 'ClientNum' column; only the first unique value of
            each is read.
        attr_data_: attribute metadata DataFrame; the columns read here are
            'EXPERIAN_DB_Col_NAME', 'CLASS_DEFN', 'BINNING_FLG', 'VAR_NAME'
            and the selected split-flag column.
        map_data: display-name mapping DataFrame; the columns read here are
            'EXP_COL_NAME', 'RANGE_VALUE', 'MAX' and 'DISPLAY_NAME'.
        pop: value written to the POPULATION_TYPE column of every row.
        n: number of clusters. The label table below only covers 0..15, so
            a larger ``n`` raises KeyError (which ends up as status -1).
        prop1, prop2: keys looked up in the loaded model dictionary and
            used to build the per-client model file names.
        propensity1, propensity2: values written to PROPENSITY_TYPE and
            used to build the ``PROP_<name>`` column names of
            SEGMENT_PROPENSITY_TMP.
        sessionId, userId: values written to every inserted row.
        cols: iterable of 'VAR_NAME' values; translated to the matching
            'EXPERIAN_DB_Col_NAME' values before being passed to
            ``preprocessing``.

    Returns:
        0 on success, -1 if any exception was raised. On failure the
        traceback is printed to stdout and the connection (if open) is
        closed without committing.

    NOTE(review): every SQL statement is built with ``str.format`` from the
    arguments; if any of them can come from an untrusted source this is an
    injection risk -- confirm against the callers.
    NOTE(review): ``joblib.load`` unpickles the model files; they must come
    from a trusted location.
    """
    try:
        start_time = time.time()
        status = 0
        # Tracks whether the DB connection must be closed in the handler.
        is_opened = 0
        print "Started"

        # Market_flg == 0 -> client-specific run; `client` is only defined
        # (and only used) on that path.
        if int(exp_data['Market_flg'].unique()[0]) == 0:
            client = 'client' + str(exp_data['ClientNum'].unique()[0])
            split_col_flg = 'SPLIT_FLG'
            market_flg = 0
        else:
            split_col_flg = 'SPLIT_FLG_NATIONAL_FILE'
            market_flg = 1

        # Keep only the attributes that actually exist as columns in
        # exp_data, then derive the per-type column lists from them.
        attr_data = attr_data_[attr_data_['EXPERIAN_DB_Col_NAME'].isin(
            list(exp_data.columns))]
        req_cols = list(attr_data['EXPERIAN_DB_Col_NAME'].values)
        cat_col = list(attr_data[attr_data.loc[:, 'CLASS_DEFN'] == 'factor']
                       ['EXPERIAN_DB_Col_NAME'].values)
        num_col = list(attr_data[attr_data.loc[:, 'CLASS_DEFN'] == 'numeric']
                       ['EXPERIAN_DB_Col_NAME'].values)
        int_col = list(attr_data[attr_data.loc[:, 'CLASS_DEFN'] == 'integer']
                       ['EXPERIAN_DB_Col_NAME'].values)
        bin_col = list(attr_data[attr_data.loc[:, 'BINNING_FLG'] == 'Yes']
                       ['EXPERIAN_DB_Col_NAME'].values)
        split_data = attr_data[attr_data.loc[:, split_col_flg] != 'No'][[
            'EXPERIAN_DB_Col_NAME', split_col_flg
        ]]
        print "attriutes loaded"
        print("--- %s seconds ---" % (time.time() - start_time))
        start_time = time.time()

        # Translate the requested VAR_NAME values into DB column names.
        need_attr = attr_data[attr_data['VAR_NAME'].isin(list(cols))]
        cols = list(need_attr['EXPERIAN_DB_Col_NAME'].values)
        print "needed attributes loaded"
        print("--- %s seconds ---" % (time.time() - start_time))
        start_time = time.time()

        #clean for modeling
        exp_data_ = exp_data[req_cols]
        con_col = num_col + int_col
        exp_data_clean_ = clean(exp_data_, con_col)
        print "data cleaned"
        print("--- %s seconds ---" % (time.time() - start_time))
        start_time = time.time()

        if market_flg == 0:
            #scoring propensity
            #dc = joblib.load("/home/ec2-user/softwares/VerbatimModule_RCLCO/model_imp.pkl" )
            # Model files are resolved relative to the current working
            # directory.
            dc = joblib.load("model_imp.pkl")
            test1 = exp_data_clean_.copy()
            test2 = pd.get_dummies(test1, columns=cat_col)

            # The *_flg variables record whether a model was found and
            # scored; all later propensity steps are gated on them.
            prop1_scores = 0
            prop1_flg = 0
            prop2_scores = 0
            prop2_flg = 0
            if client in dc:
                models_imp = dc[client]
                if prop1 in models_imp:
                    fi_prop1 = models_imp[prop1]
                    #models = joblib.load("/home/ec2-user/softwares/VerbatimModule_RCLCO/"+client+"_"+prop1+".pkl")
                    models = joblib.load(client + "_" + prop1 + ".pkl")
                    model1 = models[0]
                    fi = models[1]
                    feats1 = [i[0] for i in fi]
                    # Add any model feature missing from the dummy-encoded
                    # frame as an all-zero column so the selection works.
                    ls2 = list(set(feats1) - set(test2.columns))
                    ls1 = list(test2.columns) + ls2
                    test3 = test2.reindex(columns=ls1, fill_value=0)
                    X1 = test3[feats1]
                    # Second column of predict_proba is kept as the score.
                    prop1_scores = (model1.predict_proba(X1).T)[1]
                    prop1_flg = 1

                if prop2 in models_imp:
                    fi_prop2 = models_imp[prop2]
                    #models = joblib.load("/home/ec2-user/softwares/VerbatimModule_RCLCO/"+client+"_"+prop2+".pkl")
                    models = joblib.load(client + "_" + prop2 + ".pkl")
                    model2 = models[0]
                    fi = models[1]
                    feats2 = [i[0] for i in fi]
                    ls2 = list(set(feats2) - set(test2.columns))
                    ls1 = list(test2.columns) + ls2
                    test3 = test2.reindex(columns=ls1, fill_value=0)
                    X1 = test3[feats2]
                    prop2_scores = (model2.predict_proba(X1).T)[1]
                    prop2_flg = 1
                print "Scored propensity"

        print("--- %s seconds ---" % (time.time() - start_time))
        start_time = time.time()

        # NOTE(review): exp_data_clean is a column selection of
        # exp_data_clean_; the column assignments below may trigger pandas'
        # SettingWithCopyWarning -- confirm this is harmless here.
        exp_data_clean = exp_data_clean_[req_cols]

        exp_data_proc, cat_col_new = preprocessing(exp_data_clean, cat_col,
                                                   num_col, bin_col,
                                                   split_data, cols,
                                                   split_col_flg)
        print "data preprocessed"
        print("--- %s seconds ---" % (time.time() - start_time))
        start_time = time.time()

        clus_labels = kmeans(exp_data_proc, n, cat_col_new)
        exp_data_clean['cluster'] = clus_labels
        print "segmented"
        print("--- %s seconds ---" % (time.time() - start_time))
        start_time = time.time()

        # Integer columns are passed to profile() together with the factors.
        cat_cols = cat_col + int_col

        # Cluster index -> segment letter. Only 16 labels are defined.
        d = {
            0: 'A',
            1: 'B',
            2: 'C',
            3: 'D',
            4: 'E',
            5: 'F',
            6: 'G',
            7: 'H',
            8: 'I',
            9: 'J',
            10: 'K',
            11: 'L',
            12: 'M',
            13: 'N',
            14: 'O',
            15: 'P'
        }
        for i in range(0, n):
            exp_data_clean['cluster'].replace(i, d[i], inplace=True)

        profiling_data1, final_data1 = profile(exp_data_clean, req_cols,
                                               cat_cols, bin_col, n, 'cluster')
        print "profiled"
        print("--- %s seconds ---" % (time.time() - start_time))
        start_time = time.time()

        # Score columns default to 0 and are overwritten only when the
        # corresponding model was scored above.
        exp_data_clean['prop1_scores'] = 0
        exp_data_clean['prop2_scores'] = 0
        if market_flg == 0:
            if prop1_flg == 1:
                exp_data_clean['prop1_scores'] = prop1_scores
                # Bucket the score: 1 (< .25), 2 (.25 <= s < .7), 3 (>= .7),
                # then prefix with the cluster letter, e.g. 'A1'.
                exp_data_clean['prop1_seg'] = 1
                exp_data_clean.loc[exp_data_clean['prop1_scores'] < .25,
                                   'prop1_seg'] = 1
                exp_data_clean.loc[(exp_data_clean['prop1_scores'] >= .25) &
                                   (exp_data_clean['prop1_scores'] < .7),
                                   'prop1_seg'] = 2
                exp_data_clean.loc[exp_data_clean['prop1_scores'] >= .7,
                                   'prop1_seg'] = 3
                exp_data_clean['prop1_seg'] = exp_data_clean['cluster'].astype(
                    str) + exp_data_clean['prop1_seg'].astype(str)
                profiling_data_1, final_data_1 = profile(
                    exp_data_clean, req_cols, cat_cols, bin_col, n,
                    'prop1_seg')

            if prop2_flg == 1:
                exp_data_clean['prop2_scores'] = prop2_scores
                # Same bucketing as for prop1.
                exp_data_clean['prop2_seg'] = 1
                exp_data_clean.loc[exp_data_clean['prop2_scores'] < .25,
                                   'prop2_seg'] = 1
                exp_data_clean.loc[(exp_data_clean['prop2_scores'] >= .25) &
                                   (exp_data_clean['prop2_scores'] < .7),
                                   'prop2_seg'] = 2
                exp_data_clean.loc[exp_data_clean['prop2_scores'] >= .7,
                                   'prop2_seg'] = 3
                exp_data_clean['prop2_seg'] = exp_data_clean['cluster'].astype(
                    str) + exp_data_clean['prop2_seg'].astype(str)
                profiling_data_2, final_data_2 = profile(
                    exp_data_clean, req_cols, cat_cols, bin_col, n,
                    'prop2_seg')
        print "profiled again"
        print("--- %s seconds ---" % (time.time() - start_time))
        start_time = time.time()

        temp = exp_data_clean.copy()

        # Row counts per segment / sub-segment label.
        prop_seg = dict(temp['cluster'].value_counts())
        if market_flg == 0:
            if prop1_flg == 1:
                prop1_seg = dict(temp['prop1_seg'].value_counts())
            if prop2_flg == 1:
                prop2_seg = dict(temp['prop2_seg'].value_counts())
        print "start the dump"
        print("--- %s seconds ---" % (time.time() - start_time))
        start_time = time.time()

        #------------------------------sql dump--------------------------------------------------
        con = mysql.connector.connect(
            user='******',
            password='******',
            host='rclco.ctcznzw1aqdz.us-west-1.rds.amazonaws.com',
            port='3306',
            database='rclco_bi2i')
        #con= mysql.connector.connect(user='******', password='******',host='localhost',port='3306',database='rclco')
        # presumably the positional True is `buffered` -- confirm against
        # the installed mysql.connector version.
        cursor = con.cursor(True)
        is_opened = 1

        # One SEGMENT_DONUT_TMP row per cluster with its row count.
        for i in range(0, n):
            seg = d[i]
            value = prop_seg[seg]
            sql = "INSERT INTO SEGMENT_DONUT_TMP(sessionId,userId,SEG_ID,POPULATION_TYPE,SEGMENT_TYPE,NO_HH) VALUES('{}','{}',{},'{}','{}',{});".format(
                sessionId, userId, i + 1, pop, seg, value)
            cursor.execute(sql)

        # One SUB_SEG_DONUT_TMP row per non-empty sub-segment ('A1'..).
        if market_flg == 0:
            for i in range(0, n):
                for j in range(0, 3):
                    seg1 = str(d[i]) + str(j + 1)
                    if prop1_flg == 1:
                        if seg1 in prop1_seg:
                            value = prop1_seg[seg1]
                            sql = "INSERT INTO SUB_SEG_DONUT_TMP(sessionId,userId,SEG_ID,SEGMENT_TYPE,PROPENSITY_TYPE,NO_HH,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}',{},'{}');".format(
                                sessionId, userId, i + 1, seg1, propensity1,
                                value, pop)
                            #sqls1.append(sql)
                            cursor.execute(sql)
                    if prop2_flg == 1:
                        if seg1 in prop2_seg:
                            value1 = prop2_seg[seg1]
                            sql = "INSERT INTO SUB_SEG_DONUT_TMP(sessionId,userId,SEG_ID,SEGMENT_TYPE,PROPENSITY_TYPE,NO_HH,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}',{},'{}');".format(
                                sessionId, userId, i + 1, seg1, propensity2,
                                value1, pop)
                            #sqls2.append(sql)
                            cursor.execute(sql)

        #mapping needs to be done for cat_cols
        map_cols = map_data['EXP_COL_NAME'].unique()
        # Rows of final_data1 as {row index: [values in column order]}.
        # assumes positions 0, 2 and 3 are segment label, attribute column
        # name and attribute value -- TODO confirm against profile().
        temp1 = final_data1.T.to_dict(orient='list')
        for i in temp1:
            # Reverse lookup letter -> 1-based segment id. Indexing the
            # results of keys()/values() only works on Python 2 lists.
            seg_id = (d.keys()[d.values().index(temp1[i][0])]) + 1
            seg = temp1[i][0]
            attr_name = temp1[i][2]
            attr_value = str(temp1[i][3])
            #if temp1[i][5] == 'cat_cols':
            if attr_name in map_cols:
                x = map_data[map_data['EXP_COL_NAME'] == attr_name]
                # Display-name lookup with three fallbacks: exact string
                # match, numeric equality, then RANGE_VALUE..MAX interval.
                # If all fail the raw value is kept.
                # NOTE(review): the bare excepts hide every error type.
                try:
                    attr_value = x[x['RANGE_VALUE'] ==
                                   attr_value]['DISPLAY_NAME'].values[0]
                except:
                    try:
                        attr_value = x[x['RANGE_VALUE'].map(float) == float(
                            attr_value)]['DISPLAY_NAME'].values[0]
                    except:
                        try:
                            attr_value = x[(x['RANGE_VALUE'].map(
                                float) <= float(attr_value)) & (
                                    x['MAX'].map(float) >= float(attr_value)
                                )]['DISPLAY_NAME'].values[0]
                        except:
                            attr_value = attr_value
            #else:
            #   attr_value = "{0:.2f}".format(float(attr_value))
            # Store the human-readable VAR_NAME rather than the DB column.
            attr_name = attr_data[attr_data['EXPERIAN_DB_Col_NAME'] ==
                                  attr_name]['VAR_NAME'].values[0]
            sql = "INSERT INTO SEGMENT_DEFN_TMP(sessionId,userId,SEG_ID,POPULATION_TYPE,SEGMENT_TYPE,ATTRIBUTE_NAME,ATTRIBUTE_VALUE) VALUES('{}','{}',{},'{}','{}','{}','{}');".format(
                sessionId, userId, seg_id, pop, seg, attr_name, attr_value)
            cursor.execute(sql)

        if market_flg == 0:
            # Same definition dump for the prop1 sub-segments; the first
            # character of the sub-segment label gives the parent segment.
            if prop1_flg == 1:
                temp2 = final_data_1.T.to_dict(orient='list')
                for i in temp2:
                    seg_id = (d.keys()[d.values().index(temp2[i][0][0])]) + 1
                    sub_seg = temp2[i][0]
                    attr_name = temp2[i][2]
                    attr_value = str(temp2[i][3])
                    #if temp2[i][5] == 'cat_cols':
                    if attr_name in map_cols:
                        x = map_data[map_data['EXP_COL_NAME'] == attr_name]
                        try:
                            attr_value = x[x['RANGE_VALUE'] == attr_value][
                                'DISPLAY_NAME'].values[0]
                        except:
                            try:
                                attr_value = x[x['RANGE_VALUE'].map(
                                    float) == float(
                                        attr_value)]['DISPLAY_NAME'].values[0]
                            except:
                                try:
                                    attr_value = x[(x['RANGE_VALUE'].map(
                                        float) <= float(attr_value)) & (
                                            x['MAX'].map(float) >= float(
                                                attr_value)
                                        )]['DISPLAY_NAME'].values[0]
                                except:
                                    attr_value = attr_value
                    #else:
                    #   attr_value = "{0:.2f}".format(float(attr_value))
                    attr_name = attr_data[attr_data['EXPERIAN_DB_Col_NAME'] ==
                                          attr_name]['VAR_NAME'].values[0]
                    sql = "INSERT INTO SUB_SEG_DEFN_TMP(sessionId,userId,SEG_ID,SUB_SEG_TYPE,PROPENSITY_TYPE,ATTRIBUTE_NAME,ATTRIBUTE_VALUE,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}','{}','{}','{}');".format(
                        sessionId, userId, seg_id, sub_seg, propensity1,
                        attr_name, attr_value, pop)
                    cursor.execute(sql)

            # ... and for the prop2 sub-segments.
            if prop2_flg == 1:
                temp3 = final_data_2.T.to_dict(orient='list')
                for i in temp3:
                    seg_id = (d.keys()[d.values().index(temp3[i][0][0])]) + 1
                    sub_seg = temp3[i][0]
                    attr_name = temp3[i][2]
                    attr_value = str(temp3[i][3])
                    #if temp3[i][5] == 'cat_cols':
                    if attr_name in map_cols:
                        x = map_data[map_data['EXP_COL_NAME'] == attr_name]
                        try:
                            attr_value = x[x['RANGE_VALUE'] == attr_value][
                                'DISPLAY_NAME'].values[0]
                        except:
                            try:
                                attr_value = x[x['RANGE_VALUE'].map(
                                    float) == float(
                                        attr_value)]['DISPLAY_NAME'].values[0]
                            except:
                                try:
                                    attr_value = x[(x['RANGE_VALUE'].map(
                                        float) <= float(attr_value)) & (
                                            x['MAX'].map(float) >= float(
                                                attr_value)
                                        )]['DISPLAY_NAME'].values[0]
                                except:
                                    attr_value = attr_value
                    #else:
                    #   attr_value = "{0:.2f}".format(float(attr_value))
                    attr_name = attr_data[attr_data['EXPERIAN_DB_Col_NAME'] ==
                                          attr_name]['VAR_NAME'].values[0]
                    sql = "INSERT INTO SUB_SEG_DEFN_TMP(sessionId,userId,SEG_ID,SUB_SEG_TYPE,PROPENSITY_TYPE,ATTRIBUTE_NAME,ATTRIBUTE_VALUE,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}','{}','{}','{}');".format(
                        sessionId, userId, seg_id, sub_seg, propensity2,
                        attr_name, attr_value, pop)
                    cursor.execute(sql)

            # Dump the first four (feature, weight) entries of each model's
            # importance list; fewer than four entries raises IndexError.
            for i in range(0, 4):
                if prop1_flg == 1:
                    attr_name = attr_data[attr_data['EXPERIAN_DB_Col_NAME'] ==
                                          fi_prop1[i][0]]['VAR_NAME'].values[0]
                    sql = "INSERT INTO ATRI_CORELATION_TMP(sessionId,userId,PROPENSITY_TYPE,ATTRIBUTE_NAME,CORERLATION,POPULATION_TYPE) VALUES('{}','{}','{}','{}',{},'{}');".format(
                        sessionId, userId, propensity1, attr_name,
                        round(fi_prop1[i][1], 3), pop)
                    cursor.execute(sql)
                if prop2_flg == 1:
                    attr_name = attr_data[attr_data['EXPERIAN_DB_Col_NAME'] ==
                                          fi_prop2[i][0]]['VAR_NAME'].values[0]
                    sql = "INSERT INTO ATRI_CORELATION_TMP(sessionId,userId,PROPENSITY_TYPE,ATTRIBUTE_NAME,CORERLATION,POPULATION_TYPE) VALUES('{}','{}','{}','{}',{},'{}');".format(
                        sessionId, userId, propensity2, attr_name,
                        round(fi_prop2[i][1], 3), pop)
                    cursor.execute(sql)

            # Mean propensity per segment; a missing model is stored as
            # NULL. The two PROP_* column names come from the propensity
            # arguments.
            for i in range(0, n):
                seg = d[i]
                if prop1_flg == 1:
                    p1 = temp[temp['cluster'] == seg]['prop1_scores'].mean()
                    if prop2_flg == 1:
                        p2 = temp[temp['cluster'] ==
                                  seg]['prop2_scores'].mean()
                        sql = "INSERT INTO SEGMENT_PROPENSITY_TMP(sessionId,userId,SEG_ID,POPULATION_TYPE,SEGMENT_TYPE,{},{}) VALUES('{}','{}',{},'{}','{}',{},{});".format(
                            'PROP_' + propensity1, 'PROP_' + propensity2,
                            sessionId, userId, i + 1, pop, seg, p1, p2)
                    else:
                        sql = "INSERT INTO SEGMENT_PROPENSITY_TMP(sessionId,userId,SEG_ID,POPULATION_TYPE,SEGMENT_TYPE,{},{}) VALUES('{}','{}',{},'{}','{}',{},NULL);".format(
                            'PROP_' + propensity1, 'PROP_' + propensity2,
                            sessionId, userId, i + 1, pop, seg, p1)
                elif prop2_flg == 1:
                    p2 = temp[temp['cluster'] == seg]['prop2_scores'].mean()
                    sql = "INSERT INTO SEGMENT_PROPENSITY_TMP(sessionId,userId,SEG_ID,POPULATION_TYPE,SEGMENT_TYPE,{},{}) VALUES('{}','{}',{},'{}','{}',NULL,{});".format(
                        'PROP_' + propensity1, 'PROP_' + propensity2,
                        sessionId, userId, i + 1, pop, seg, p2)
                else:
                    sql = "INSERT INTO SEGMENT_PROPENSITY_TMP(sessionId,userId,SEG_ID,POPULATION_TYPE,SEGMENT_TYPE,{},{}) VALUES('{}','{}',{},'{}','{}',NULL,NULL);".format(
                        'PROP_' + propensity1, 'PROP_' + propensity2,
                        sessionId, userId, i + 1, pop, seg)

                cursor.execute(sql)
                # Mean propensity per non-empty sub-segment.
                for j in range(0, 3):
                    seg1 = str(d[i]) + str(j + 1)
                    if prop1_flg == 1:
                        if seg1 in prop1_seg:
                            p1 = temp[temp['prop1_seg'] ==
                                      seg1]['prop1_scores'].mean()
                            sql = "INSERT INTO SUB_SEG_PROPENSITY_TMP(sessionId,userId,SEG_ID,PROPENSITY_TYPE,SEGMENT_TYPE,PROP_VALUE,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}',{},'{}');".format(
                                sessionId, userId, i + 1, propensity1, seg1,
                                p1, pop)
                            cursor.execute(sql)
                    if prop2_flg == 1:
                        if seg1 in prop2_seg:
                            p2 = temp[temp['prop2_seg'] ==
                                      seg1]['prop2_scores'].mean()
                            sql = "INSERT INTO SUB_SEG_PROPENSITY_TMP(sessionId,userId,SEG_ID,PROPENSITY_TYPE,SEGMENT_TYPE,PROP_VALUE,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}',{},'{}');".format(
                                sessionId, userId, i + 1, propensity2, seg1,
                                p2, pop)
                            cursor.execute(sql)

        # Per-segment attribute scores from the profiling output. Missing
        # or NaN scores are written as 0.00, and each VAR_NAME is inserted
        # at most once per segment (tracked in `attr`).
        need_cols1 = final_data1['variable'].unique()
        for i in range(0, n):
            seg = d[i]
            attr = []
            for j in need_cols1:
                value1 = profiling_data1[
                    (profiling_data1['cluster'] == seg)
                    & (profiling_data1['variable'] == j)]['score']
                if len(value1) == 0:
                    value = 0
                else:
                    value = value1.values[0]
                if np.isnan(value):
                    value = 0
                value = "{0:.2f}".format(float(value))
                attr_name = attr_data[attr_data['EXPERIAN_DB_Col_NAME'] ==
                                      j]['VAR_NAME'].values[0]
                if attr_name not in attr:
                    sql = "INSERT INTO SEGMENT_ATTR_QUAL_TMP (sessionId,userId,SEG_ID,POPULATION_TYPE,SEGMENT_TYPE,ATTRIBUTE_NAME,ATTRI_QUAL) VALUES('{}','{}',{},'{}','{}','{}',{});".format(
                        sessionId, userId, i + 1, pop, seg, attr_name, value)
                    cursor.execute(sql)
                    attr.append(attr_name)

        # Same attribute-score dump for every non-empty sub-segment. The
        # variable list is the union over the three sub-segments of the
        # parent segment.
        if market_flg == 0:
            for i in range(0, n):
                seg = d[i]

                for l in range(0, 3):
                    sub_seg = str(d[i]) + str(l + 1)
                    if prop1_flg == 1:
                        need_cols_1 = final_data_1[
                            final_data_1['cluster'].isin(
                                [d[i] + '1', d[i] + '2',
                                 d[i] + '3'])]['variable'].unique()
                        if sub_seg in prop1_seg:
                            attr = []
                            for j in need_cols_1:
                                value1 = profiling_data_1[
                                    (profiling_data_1['cluster'] == sub_seg)
                                    & (profiling_data_1['variable'] == j
                                       )]['score']
                                if len(value1) == 0:
                                    value = 0
                                else:
                                    value = value1.values[0]
                                if np.isnan(value):
                                    value = 0
                                value = "{0:.2f}".format(float(value))
                                attr_name = attr_data[
                                    attr_data['EXPERIAN_DB_Col_NAME'] ==
                                    j]['VAR_NAME'].values[0]
                                if attr_name not in attr:
                                    sql = "INSERT INTO SUB_SEG_ATTR_QUAL_TMP (sessionId,userId,SEG_ID,SEGMENT_TYPE,PROPENSITY_TYPE,ATTRIBUTE_NAME,ATTRI_QUAL,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}','{}',{},'{}');".format(
                                        sessionId, userId, i + 1, sub_seg,
                                        propensity1, attr_name, value, pop)
                                    cursor.execute(sql)
                                    attr.append(attr_name)
                    if prop2_flg == 1:
                        need_cols_2 = final_data_2[
                            final_data_2['cluster'].isin(
                                [d[i] + '1', d[i] + '2',
                                 d[i] + '3'])]['variable'].unique()
                        if sub_seg in prop2_seg:
                            attr1 = []
                            for k in need_cols_2:
                                value2 = profiling_data_2[
                                    (profiling_data_2['cluster'] == sub_seg)
                                    & (profiling_data_2['variable'] == k
                                       )]['score']
                                if len(value2) == 0:
                                    value = 0
                                else:
                                    value = value2.values[0]
                                if np.isnan(value):
                                    value = 0
                                value = "{0:.2f}".format(float(value))
                                attr_name = attr_data[
                                    attr_data['EXPERIAN_DB_Col_NAME'] ==
                                    k]['VAR_NAME'].values[0]
                                if attr_name not in attr1:
                                    sql = "INSERT INTO SUB_SEG_ATTR_QUAL_TMP (sessionId,userId,SEG_ID,SEGMENT_TYPE,PROPENSITY_TYPE,ATTRIBUTE_NAME,ATTRI_QUAL,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}','{}',{},'{}');".format(
                                        sessionId, userId, i + 1, sub_seg,
                                        propensity2, attr_name, value, pop)
                                    cursor.execute(sql)
                                    attr1.append(attr_name)
        # All inserts are committed together at the very end.
        con.commit()
        con.close()
        is_opened = 0
        print "Success"
        print("--- %s seconds ---" % (time.time() - start_time))

    # NOTE(review): bare except -- also catches KeyboardInterrupt and
    # SystemExit; every failure is reduced to status -1 plus a traceback.
    except:
        if is_opened == 1:
            con.close()
        status = -1
        print "Failed"
        print("--- %s seconds ---" % (time.time() - start_time))
        traceback.print_exc(file=sys.stdout)

    return status

Esempio n. 17

0

Mostra file

import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from preprocess import preprocessing

# Feature columns handed to the classifier; the label column 'Criminal'
# is deliberately not part of this list.
cols = [
    'IFATHER', 'NRCH17_2', 'IRHHSIZ2', 'IIHHSIZ2', 'IRKI17_2', 'IIKI17_2',
    'IRHH65_2', 'IIHH65_2', 'PRXRETRY', 'PRXYDATA', 'MEDICARE', 'CAIDCHIP',
    'CHAMPUS', 'PRVHLTIN', 'GRPHLTIN', 'HLTINNOS', 'HLCNOTYR', 'HLCNOTMO',
    'HLCLAST', 'HLLOSRSN', 'HLNVCOST', 'HLNVOFFR', 'HLNVREF', 'HLNVNEED',
    'HLNVSOR', 'IRMCDCHP', 'IIMCDCHP', 'IRMEDICR', 'IIMEDICR', 'IRCHMPUS',
    'IICHMPUS', 'IRPRVHLT', 'IIPRVHLT', 'IROTHHLT', 'IIOTHHLT', 'HLCALLFG',
    'HLCALL99', 'ANYHLTI2', 'IRINSUR4', 'IIINSUR4', 'OTHINS', 'CELLNOTCL',
    'CELLWRKNG', 'IRFAMSOC', 'IIFAMSOC', 'IRFAMSSI', 'IIFAMSSI', 'IRFSTAMP',
    'IIFSTAMP', 'IRFAMPMT', 'IIFAMPMT', 'IRFAMSVC', 'IIFAMSVC', 'IRWELMOS',
    'IIWELMOS', 'IRPINC3', 'IRFAMIN3', 'IIPINC3', 'IIFAMIN3', 'GOVTPROG',
    'POVERTY3', 'TOOLONG', 'TROUBUND', 'PDEN10', 'COUTYP2', 'MAIIN102',
    'AIIND102', 'ANALWT_C', 'VESTR', 'VEREP',
]

# Load the training data and run it through the project's preprocessing.
train = preprocessing(pd.read_csv('criminal_train.csv'))
train_x, train_y = train[cols], train['Criminal']

# Fit an extra-trees ensemble with default settings.
model = ExtraTreesClassifier()
model.fit(train_x, train_y)

# Print (importance, column name) pairs in the order of `cols`.
importance_by_column = list(zip(model.feature_importances_, cols))
print(importance_by_column)

Esempio n. 18

0

Mostra file

import cv2
import numpy as np
from matplotlib import pyplot as plt
from random import randint
from preprocess import preprocessing
from sklearn.cluster import KMeans

# Load one randomly chosen sample image from the dataset folder.
new_image = cv2.imread('./dataset/%d.png' % randint(0, 1000))

# Run the project's preprocessing and keep a copy of the result on disk.
new_image = preprocessing(new_image)
cv2.imwrite("./org.jpg", new_image)

# Cluster the coordinates of every pixel whose value is exactly 255.
np_img = np.argwhere(np.asarray(new_image) == 255)
print(np_img[:, 0])
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans = kmeans.fit(np_img)

# Mark each cluster centre with a small red circle. argwhere yields
# (row, col) pairs while cv2.circle takes (x, y), hence the swap.
new_image = cv2.cvtColor(new_image, cv2.COLOR_GRAY2BGR)
for center in kmeans.cluster_centers_:
    center_xy = (int(center[1]), int(center[0]))
    cv2.circle(new_image, center_xy, 1, (0, 0, 255), 2)
cv2.imwrite("./kmeans.jpg", new_image)

# draw plot :

Esempio n. 19

0

Mostra file

from preprocess import preprocessing
from cnn import build_dcnn
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Flatten, Dense, Conv2D, MaxPooling2D, Dropout, BatchNormalization, LeakyReLU, Activation, TimeDistributed, LSTM, Bidirectional, GlobalAvgPool2D, GlobalMaxPool2D, ZeroPadding2D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, TensorBoard, LearningRateScheduler
from keras import backend as K

# Load the six arrays returned by the project's preprocessing step
# (presumably train / validation / test splits, going by their names).
(X_train, y_train,
 X_val, y_val,
 X_test, y_test) = preprocessing('./data/fer2013.csv')

# Random augmentations applied to the training images.
augmentation_settings = dict(
    rotation_range=15,
    width_shift_range=0.15,
    height_shift_range=0.15,
    shear_range=0.15,
    zoom_range=0.15,
    horizontal_flip=True,
)
train_datagen = ImageDataGenerator(**augmentation_settings)
train_datagen.fit(X_train)

optimizer = optimizers.Nadam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    name='Nadam',
)

# Build the network for 48x48 single-channel inputs and compile it.
input_shape = (48, 48, 1)
model = build_dcnn(input_shape)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Esempio n. 20

0

Mostra file

# -*- coding: utf-8 -*-
"""
Created on Tue Aug 20 16:16:20 2019

@author: raahul46
"""
####DEPENDENCIES####
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from preprocess import preprocessing

####PREPROCESSING OF DATASET####
# preprocessing() is a project helper; it returns the train and test
# features/labels in this order.
x_train, y_train, x_test, y_test = preprocessing()

####MODEL TRAINING####
print("...training model...")
classifier1 = RandomForestClassifier(
    n_estimators=300,
    criterion="entropy",
)
classifier1.fit(x_train, y_train)

####PREDICTION####
y_pred = classifier1.predict(x_test)

#SAVING THE MODEL(Pickle File)
filename = 'RF_model_final1.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the original passed a bare open() and never closed it.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier1, model_file)

Esempio n. 21

0

Mostra file

def st_app(df_read):
    """Render the "Best books of the decade" Streamlit page.

    The page is emitted top to bottom: sidebar and title, an optional dump of
    the preprocessed dataframe, the ANALYSIS section, the VISUALIZATION
    section, and finally a place lookup that geocodes user-typed text and
    shows the result on a map.

    Args:
        df_read: Raw data handed to ``preprocessing(df=...)``. The
            preprocessed frame must expose the columns used below
            (``Title_URL``, ``year_published``, ``minmax_norm_rating``,
            ``Author`` and ``Title``).

    Returns:
        None. All output goes through Streamlit calls.
    """
    # Shared look for every styled table on the page.
    table_style = {
        'background-color': 'black',
        'color': 'white',
        'border-color': 'blue'
    }

    set_png_as_page_bg('hog-1.png')
    st.sidebar.markdown(html_temp_1.format("Team : McGonagall"),
                        unsafe_allow_html=True)
    st.sidebar.selectbox("Choose", ['Project', 'About'])
    st.markdown(html_temp_1.format("Best books of the decade:2000 "),
                unsafe_allow_html=True)
    #columns = ['Title', 'Author', 'minirating', 'num_reviews', 'num_pages', 'awards', 'genres', 'series', 'year_published', 'places']
    df = preprocessing(df=df_read)
    df = df.drop("Title_URL", axis=1)
    my_bar = st.progress(0)
    if st.checkbox("Generate Data"):
        with st.spinner("Waiting .."):
            # Purely cosmetic: step the bar 0, 20, ..., 100 before showing
            # the table.
            for p in range(0, 120, 20):
                time.sleep(0.1)
                my_bar = my_bar.progress(p)
            st.dataframe(df.style.set_properties(**table_style))
            st.success("Generated Dataframe")
    # ANALYSIS
    ## EX-1
    st.markdown(html_temp_3.format("ANALYSIS"), unsafe_allow_html=True)
    str1 = "Group the books by `original_publish_year` and get the mean of the `minmax_norm_ratings` of the groups."
    st.markdown(html_temp_2.format(str1), unsafe_allow_html=True)
    groupby_minmax = df.groupby('year_published').agg(
        {'minmax_norm_rating': 'mean'})
    col1, col2 = st.beta_columns([2, 4])
    # From here on groupby_minmax is the styled object, and that is what both
    # the table and the chart receive.
    groupby_minmax = groupby_minmax.style.set_properties(**table_style)
    with col1:
        st.dataframe(groupby_minmax)
    with col2:
        st.area_chart(groupby_minmax)

    # EX-2
    str2 = 'Create a function that given an author as input it returns her/his book with the highest minmax_norm_ratings.'
    st.markdown(html_temp_2.format(str2), unsafe_allow_html=True)
    col3, col4 = st.beta_columns(2)
    with col3:
        auth = st.selectbox("Select Author", df['Author'].unique().tolist())
    with col4:
        st.success(authors_best(auth, df))

    st.markdown(html_temp_3.format("VISUALIZATION"), unsafe_allow_html=True)
    # EX-1
    str3 = 'Create a 2D scatterplot with `pages` on the x-axis and `num_ratings` on the y-axis.'
    st.markdown(html_temp_2.format(str3), unsafe_allow_html=True)
    ex_1 = scatter_pages_num_rating(df)
    st.plotly_chart(ex_1)
    st.subheader("Same plot using Streamlit-line_vega_chart")
    plotly_line_vega(df)

    # EX-2
    str4 = 'Can you compute numerically the correlation coefficient of these two columns?'
    st.markdown(html_temp_2.format(str4), unsafe_allow_html=True)
    st.write(plot_correlation(df))

    # EX-3
    str5 = 'Visualise the `avg_rating` distribution.'
    st.markdown(html_temp_2.format(str5), unsafe_allow_html=True)
    ex_3 = avg_rating_dist(df)
    st.plotly_chart(ex_3)

    # EX-4
    str6 = 'Visualise the `minmax_norm_rating` distribution.'
    st.markdown(html_temp_2.format(str6), unsafe_allow_html=True)
    st.plotly_chart(minmax_norm_dist(df))

    # EX-5
    str7 = 'Visualise the `mean_norm_rating` distribution.'
    st.markdown(html_temp_2.format(str7), unsafe_allow_html=True)
    st.plotly_chart(mean_norm_dist(df))
    st.plotly_chart(all_three_dist(df))

    # EX-6
    str8 = 'Create one graph that represents in the same figure both `minmax_norm_rating` and `mean_norm_rating` distributions.'
    st.markdown(html_temp_2.format(str8), unsafe_allow_html=True)
    st.plotly_chart(norm_comparison(df))

    # EX-8
    str9 = 'Visualize the awards distribution in a boxplot and aggregtated bars.'
    st.markdown(html_temp_2.format(str9), unsafe_allow_html=True)
    st.plotly_chart(awards_boxplot(df))

    # EX-9
    str10 = 'Group the `books` by `original_publish_year` and get the mean of the `minmax_norm_ratings` of the groups.'
    st.markdown(html_temp_2.format(str10), unsafe_allow_html=True)
    st.plotly_chart(yearly_minmax_mean(df))

    # EX-10
    str11 = 'Make a scatterplot to represent minmax_norm_ratings in function of the number of awards won by the book.'
    st.markdown(html_temp_2.format(str11), unsafe_allow_html=True)
    st.plotly_chart(minmax_awards(df))

    # EX-7 is intentionally absent: the original author marked it
    # "Not working" and left it disabled.

    # Explore maps in streamlit
    str_m = "Books and Places."
    st.markdown(html_temp_2.format(str_m), unsafe_allow_html=True)
    book = st.selectbox("Select Book", df['Title'].unique())
    df_res = pd.DataFrame(place_title(book, df))
    st.write(df_res.style.set_properties(**table_style))
    # NOTE(review): the bare string below is an expression statement,
    # presumably rendered through Streamlit "magic"; keep it as-is.
    """### *Type the Location you received above*"""
    where = st.text_area("\n", " Type here...")
    if st.button("Submit"):
        geolocator = Nominatim(user_agent="a")
        location = geolocator.geocode(where)
        if location is None:
            # geocode() yields None when nothing matches (e.g. the untouched
            # placeholder text); report it instead of crashing on
            # `None.latitude`.
            st.error("Could not find that location. "
                     "Please check the text and try again.")
        else:
            map_df = pd.DataFrame.from_dict({
                "lat": [location.latitude],
                "lon": [location.longitude]
            })
            st.map(map_df)

Esempio n. 22

0

Mostra file

File: b22_lineer_regression.py Progetto: EmreKARAgh/Machine-Learning-Courses

# -*- coding: utf-8 -*-
import preprocess
from sklearn.linear_model import LinearRegression
import pandas as pd
from matplotlib import pyplot as plt

# Fetch the train/test split from the project's preprocessing object.
obj = preprocess.preprocessing()
x_train, x_test, y_train, y_test = obj.getData()

# Fit a linear regression and predict for the held-out inputs; the
# predictions are wrapped in a frame that shares y_test's index so the two
# can be placed side by side.
lr = LinearRegression()
lr.fit(x_train, y_train)
predict = pd.DataFrame(
    data=lr.predict(x_test),
    index=y_test.index,
    columns=['tahmin'],
)
comparision = pd.concat([y_test, predict], axis=1)

# Sort by index; this does not break the month/sales pairing.
x_train = x_train.sort_index()
y_train = y_train.sort_index()

# Red title and axis labels (runtime strings stay in Turkish).
for set_text, text in (
        (plt.title, 'Aylara Göre Satış Tahmin Grafiği'),
        (plt.xlabel, 'Aylar'),
        (plt.ylabel, 'Satışlar'),
):
    set_text(text, color='r')

# Training points, the line through them, then the prediction line.
plt.scatter(x_train, y_train)
plt.plot(x_train, y_train)
plt.plot(x_test, predict)

Esempio n. 23

0

Mostra file

File: project_3_birds.py Progetto: Drotth/NUMA01

    xaxis.set_minor_formatter(dates.DateFormatter('%H'))  # minor locator timme
    xaxis.set_tick_params(which='major', pad=18)  # sätter ner datumen 18 steg

    plt.title("Nesting box activities")
    plt.ylabel("In/out movements per hour")
    save_plot()
    plt.show()

# --------------------- TASK 7 (EXTRA) ----------------------------------------


def save_plot():
    """Ask on stdin whether to save the current figure; save it on 's'.

    Any answer other than exactly ``'s'`` skips saving.
    """
    answer = input('Press s to save graph as png file or enter to skip: ')
    if answer != 's':
        return
    plt.savefig('bird_movements.png', bbox_inches='tight')
    print("File saved as bird_movements.png in your working directory")

if __name__ == '__main__':
    # Load the data file, convert timestamps to local time, then keep
    # plotting for as long as the user answers 'y'.
    list_dates, list_data = preprocessing("birds.txt")
    convert_local_timezone()
    continue_loop = 'y'
    # Compare by value: `is` tests object identity, which for strings read
    # from input() only works by accident of interning (and is a
    # SyntaxWarning on Python 3.8+).
    while continue_loop == 'y':
        graph_dates, graph_data, sun_indexes = compute_data()
        plot_graph(graph_dates, graph_data, sun_indexes)
        # day_night_cycle()
        continue_loop = input('Do you want to plot something more? [y/n]')
        if continue_loop not in ('y', 'n'):
            print(
                "Since you apparently can't read, I'll shut it down for you.")
            continue_loop = 'n'

Esempio n. 24

0

Mostra file

File: test_preprocess.py Progetto: amitashnanda/Plant-Sustainability-Enhancement-Using-UAV

# import necessary packages
import os
import random
import cv2
from preprocess import preprocessing

# Gather every file path found one level below the dataset root, keeping each
# sub-folder's entries in sorted order.
root_path = "raw_leaf_data/"
all_image_files = []
for fold in os.listdir(root_path):
    fold_path = os.path.join(root_path, fold)
    all_image_files.extend(
        sorted(
            os.path.join(fold_path, file) for file in os.listdir(fold_path)))

# Pick one path at random, run it through preprocessing and write the result
# next to the script.
r_no = random.randint(0, len(all_image_files) - 1)
test_image = all_image_files[r_no]
print('displaying', test_image)
random_save = preprocessing(test_image)
cv2.imwrite('test.jpg', random_save)

Esempio n. 25

0

Mostra file

    'CAIDCHIP', 'HLLOSRSN', 'MAIIN102', 'IRMCDCHP'
]

# to filter -1 rows with preprocess 0.63387 w/0 0.67071


def isnt(*cols):
    """Return True when none of the given values equals -1.

    With no arguments the result is True.
    """
    return not any(value == -1 for value in cols)


# Training set: read the CSV, run it through preprocessing, then split it
# into the feature columns and the 'Criminal' target.
train = preprocessing(
    pandas.read_csv('criminal_train.csv'))  # 0.68185 with preprocess
train_x, train_y = train[cols], train['Criminal']

# Prediction set: same treatment, keeping the 'PERID' column aside.
predict_data = preprocessing(pandas.read_csv('criminal_test.csv'))
predict_id = predict_data['PERID']
predict_x = predict_data[cols]

lr = LogisticRegression()

# Mean cross-validated accuracy on the training data, reported before the
# final fit.
fold_scores = cross_val_score(lr, train_x, train_y, scoring='accuracy')
cv_score = np.mean(fold_scores)
print('CV score for class is {}'.format(cv_score))

lr.fit(train_x, train_y)

Esempio n. 26

0

Mostra file

def setRepo():
    """Run preprocessing on the path currently held by ``repo_entry``."""
    preprocess.preprocessing(repo_entry.get())

Esempio n. 27

0

Mostra file

            return False
    return True


def isnt2(*cols):
    """Return True when at least one of the given values equals -1.

    With no arguments the result is False.
    """
    return any(value == -1 for value in cols)


##train = train[train[cols].apply(lambda x: isnt(*x), axis=1)]
##test_missing = test[test[cols].apply(lambda x: isnt2(*x), axis=1)]
##test = test[test[cols].apply(lambda x: isnt(*x), axis=1)]

# Run both frames through the shared preprocessing step.
train, test = preprocessing(train), preprocessing(test)

# Plain arrays for the model: features from `cols`, target from 'Criminal'.
X_train, y_train = train[cols].values, train["Criminal"].values
X_test = test[cols].values

max_features = 200000  # max value of data
maxlen = len(cols)  # length of the input (original note says 70)
embed_size = 300

# Input -> frozen embedding -> spatial dropout.
sequence_input = Input(shape=(maxlen, ))
embedding_layer = Embedding(max_features, embed_size, trainable=False)
x = embedding_layer(sequence_input)
x = SpatialDropout1D(0.2)(x)

Esempio n. 28

0

Mostra file

File: main.py Progetto: Xuweijia-buaa/Gated-Attention-Reader-model

def train(hp_dict, args, data_dir, save_path):
    """Train a ``GA_reader`` model, checkpointing and evaluating as it goes.

    Loads the data, builds train/validation/test batch loaders, initialises
    the model from word2vec embeddings, optionally resumes from a checkpoint
    found in ``save_path``, then runs ``NUM_EPOCHS`` epochs. Within an epoch
    it logs and saves ``init_model.pkl`` every ``print_every`` iterations,
    evaluates on the validation loader every ``eval_every`` iterations
    (saving ``best_model.pkl`` on improvement), and evaluates on the test
    loader once the epoch's batches are exhausted.

    Args:
        hp_dict: Hyper-parameter mapping. Keys read here: ``char_dim``,
            ``use_bin``, ``embed_file``, ``nhidden``, ``train_emb``,
            ``char_nhidden``, ``nlayers``, ``gating_fn``, ``use_feat`` and
            ``dropout``.
        args: Not referenced anywhere in this function body.
        data_dir: Passed to ``preprocessing().preprocess`` as the data
            location.
        save_path: Directory that receives a copy of ``config.py`` and the
            ``init_model.pkl`` / ``best_model.pkl`` checkpoints; existing
            checkpoints there are loaded before training.

    Returns:
        Nothing is returned in the code shown; results are reported through
        ``logging`` and the checkpoint files.

    Note:
        ``BATCH_SIZE``, ``EMBED_SIZE``, ``USE_CUDA``, ``LEARNING_RATE``,
        ``NUM_EPOCHS``, ``GRAD_CLIP``, ``print_every`` and ``eval_every``
        are names defined outside this function.
    """
    # Character-level inputs are enabled only for a positive char_dim.
    use_chars = hp_dict['char_dim'] > 0
    # load data
    dp = preprocessing()
    data = dp.preprocess(data_dir, no_training_set=False, use_chars=use_chars)

    # build minibatch loader
    train_batch_loader = mini_batch_loader(data.training,
                                           BATCH_SIZE,
                                           sample_rate=1.0,
                                           len_bin=hp_dict['use_bin'])
    valid_batch_loader = mini_batch_loader(data.validation,
                                           BATCH_SIZE,
                                           shuffle=False,
                                           len_bin=hp_dict['use_bin'])
    test_batch_loader = mini_batch_loader(data.test,
                                          BATCH_SIZE,
                                          shuffle=False,
                                          len_bin=hp_dict['use_bin'])

    logging.info("loading word2vec file ...")
    embed_init, embed_dim = \
        load_word2vec_embeddings(data.dictionary[0], hp_dict['embed_file'],EMBED_SIZE)
    logging.info("embedding dim: {}".format(embed_dim))
    logging.info("initialize model ...")

    model = GA_reader(hp_dict['nhidden'], data.vocab_size, embed_dim,
                      embed_init, hp_dict['train_emb'], use_chars,
                      hp_dict['char_nhidden'], data.n_chars,
                      hp_dict['char_dim'], hp_dict['nlayers'],
                      hp_dict['gating_fn'], hp_dict['use_feat'],
                      hp_dict['dropout'])

    if USE_CUDA:
        model.cuda()
    logging.info("Running on cuda: {}".format(USE_CUDA))
    # training phase
    # Only parameters with requires_grad=True are handed to the optimizer.
    opt = torch.optim.Adam(params=filter(lambda p: p.requires_grad,
                                         model.parameters()),
                           lr=LEARNING_RATE)

    # Keep a copy of the configuration next to the checkpoints.
    # NOTE(review): 'config.py' is resolved against the current working
    # directory -- confirm that is where the script is launched from.
    shutil.copyfile('config.py', os.path.join(save_path, 'config.py'))
    #
    # Resume: prefer the best validated checkpoint, otherwise fall back to
    # the most recent periodic one.
    # load existing best model
    if os.path.isfile(os.path.join(save_path, 'best_model.pkl')):
        print('loading previously best model')
        model.load_state_dict(
            torch.load(os.path.join(save_path, 'best_model.pkl')))
    # load existing train_model
    elif os.path.isfile(os.path.join(save_path, 'init_model.pkl')):
        print('loading init model')
        model.load_state_dict(
            torch.load(os.path.join(save_path, 'init_model.pkl')))

    logging.info('-' * 50)
    logging.info("Start training ...")
    best_valid_acc = best_test_acc = 0
    for epoch in range(NUM_EPOCHS):
        # Set below when validation improves; not read in the code shown.
        new_max = False
        # From the third epoch on (index >= 2), halve the learning rate of
        # every parameter group at the start of each epoch.
        if epoch >= 2:
            for param_group in opt.param_groups:
                param_group['lr'] /= 2
        model.train()
        # Running totals for the current logging window, plus the iteration
        # counter `it` for this epoch.
        acc = loss = n_examples = it = 0
        start = time.time()

        # `fnames` is unpacked from each batch but not used below.
        for dw, dw_m,qw,qw_m,dt,qt,tt,tm, \
                 answear, candidate, candi_m, cloze_pos, fnames in train_batch_loader:
            n_examples += dw.shape[0]
            feat = feat_fuc(dw, qw)
            #-------train-------#
            dw, dw_m,qw,qw_m,dt,qt,tt,tm, answear, candidate, candi_m, cloze_pos,feat=to_vars(\
           [dw, dw_m,qw,qw_m,dt,qt,tt,tm, answear, candidate, candi_m, cloze_pos,feat], use_cuda=USE_CUDA)

            loss_, acc_ = model(dw, dw_m, qw, qw_m, dt, qt, tt, tm, answear,
                                candidate, candi_m, cloze_pos,
                                feat)  # tensor.float size 1
            #print(acc_.cpu().data.numpy())
            # NOTE(review): indexing [0] assumes loss_/acc_ convert to
            # 1-element arrays (see the "size 1" remark above) -- confirm
            # against the installed torch version.
            loss += loss_.cpu().data.numpy()[0]  # numpy [1]
            acc += acc_.cpu().data.numpy()[0]
            it += 1
            opt.zero_grad()
            loss_.backward()
            # Clip gradients of the trainable parameters before the update.
            clip_grad_norm(parameters=filter(lambda p: p.requires_grad,
                                             model.parameters()),
                           max_norm=GRAD_CLIP)
            opt.step()
            # Log every `print_every` iterations and also whenever `it` is a
            # multiple of the loader length.
            if it % print_every == 0 \
                    or it % len(train_batch_loader) == 0:
                spend = (time.time() - start) / 60
                statement = "Epoch: {}, it: {} (max: {}), "\
                    .format(epoch, it, len(train_batch_loader))
                # NOTE(review): loss is always divided by print_every, even
                # when this branch fires on the loader-length condition and
                # the window may hold a different number of iterations.
                statement += "loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)"\
                    .format(loss / print_every, acc / n_examples, spend)
                logging.info(statement)
                # Start a fresh logging window.
                del acc, loss, n_examples
                acc = loss = n_examples = 0
                start = time.time()
                # save every print
                torch.save(model.state_dict(),
                           os.path.join(save_path, 'init_model.pkl'))
                # torch.save(model,os.path.join(save_path,'init_model.pkl'))
            #-------valid-------#
            if it % eval_every == 0:
                start = time.time()
                model.eval()
                # Despite the names, test_loss/test_acc hold the
                # *validation* results here.
                test_loss, test_acc = evaluate(model, valid_batch_loader,
                                               USE_CUDA)
                spend = (time.time() - start) / 60
                statement = "Valid loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)"\
                    .format(test_loss, test_acc, spend)
                logging.info(statement)
                if best_valid_acc < test_acc:
                    best_valid_acc = test_acc
                    new_max = True
                    # store best valid model
                    torch.save(model.state_dict(),
                               os.path.join(save_path, 'best_model.pkl'))
                    #torch.save(model,os.path.join(save_path,'best_model.pkl'))
                logging.info("Best valid acc: {:.3f}".format(best_valid_acc))
                # Back to training mode; restart the window timer so the
                # evaluation time is not charged to training.
                model.train()
                start = time.time()
        #-------test-------#
        # Once per epoch, after all training batches: evaluate on the test
        # loader and track the best test accuracy seen so far.
        start = time.time()
        model.eval()
        test_loss, test_acc = evaluate(model, test_batch_loader, USE_CUDA)
        spend = (time.time() - start) / 60
        logging.info("Test loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)"\
                     .format(test_loss, test_acc, spend))
        if best_test_acc < test_acc:
            best_test_acc = test_acc
        logging.info("Best test acc: {:.3f}".format(best_test_acc))

Esempio n. 29

0

Mostra file

def multi_process_contrast(input_dir):
    """Print, preprocess and contrast-enhance every ``*.png`` in a directory.

    Each matching path is printed, then passed first to ``preprocessing`` and
    then to ``contrast_enhancement``.
    """
    pattern = os.path.join(input_dir, '*.png')
    for png_path in glob.glob(pattern):
        print(png_path)
        preprocessing(png_path)
        contrast_enhancement(png_path)

Esempio n. 30

0

Mostra file

File: main.py Progetto: kamalkraj/Malayalam-News-Classifier

def save_model(model, model_path):
    """Write the model's ``state_dict`` to ``model_path`` via ``torch.save``."""
    state = model.state_dict()
    torch.save(state, model_path)


def load_model(model, model_path, use_cuda=False):
    """Load a saved ``state_dict`` from ``model_path`` into ``model``.

    Tensors are mapped to ``'cuda:0'`` only when ``use_cuda`` is truthy and
    CUDA is available; otherwise they are mapped to ``'cpu'``. The same
    ``model`` object is returned.
    """
    wants_gpu = use_cuda and torch.cuda.is_available()
    map_location = 'cuda:0' if wants_gpu else 'cpu'
    state = torch.load(model_path, map_location)
    model.load_state_dict(state)
    return model


# Build every dataset split plus the embedding data in one preprocessing call.
data_dict = preprocessing(
    "data/train.csv",
    "data/test.csv",
    "embeddings/malayalam200.txt",
)

# Pull the three splits out of the returned mapping.
train_data, val_data, test_data = (
    data_dict[split] for split in ("data_train", "data_val", "data_test"))

# The classifier is built from the pretrained embeddings and padding index.
classifier = MalayalamModel(
    data_dict["pretrained_embeddings"],
    data_dict["padding_idx"],
)

# Use the first GPU when one is available, otherwise stay on the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Training hyper-parameters.
batch_size = 256
epochs = 20
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-2)

Esempio n. 31

0

Mostra file

File: analyzer.py Progetto: henriquepapa/Erro-Nm

def analyze():
    """Command-line entry point for the CST analysis pipeline.

    Parses ``--d`` (input directory, required), ``--o`` (output directory,
    required) and ``--e`` (optional), reads every file in the input directory
    except ``.DS_Store``, then runs the stages in order: preprocessing,
    candidate pair selection, rule application, feature extraction and
    classification. A failure while reading a file is printed and re-raised.
    """
    parser = argparse.ArgumentParser(description='CST Parser command line')
    parser.add_argument("--d",
                        help="path of directory with the texts",
                        required=True,
                        action="store",
                        dest="directory_path")
    parser.add_argument("--o",
                        help="path of directory to store results",
                        required=True,
                        action="store",
                        dest="analysis_path")
    parser.add_argument("--e",
                        help="embed sentence text into cst analysis",
                        required=False,
                        action="store",
                        dest="embed")
    args = parser.parse_args()

    logging.info('Getting documents ...')
    # Skip the metadata file macOS leaves in directories.
    files = [
        name for name in os.listdir(args.directory_path)
        if name != '.DS_Store'
    ]

    texts = []
    for name in files:
        path = os.path.join(args.directory_path, name)
        try:
            with open(path, 'r') as handle:
                texts.append(handle.read())
        except BaseException:
            # Report which file failed, then let the error propagate.
            print(
                "The following error occurred while opening file {}: ".format(
                    path),
                sys.exc_info()[0])
            raise

    # Prepare XML data with files tokenized by sentence
    logging.info('Preprocessing documents ...')
    generated_files = preprocessing(texts, args.analysis_path)

    # Select pairs of sentence to be related using word overlap
    logging.info('Selecting candidate sentence pairs ...')
    selected_pairs = select_pairs(generated_files, 0.12)

    # Apply rules on selected pairs
    logging.info('Applying rules ...')
    apply_rules(selected_pairs, args.analysis_path, args.embed)

    # Extract features
    logging.info('Extracting features ...')
    features = extract_features(selected_pairs, args.analysis_path)

    # Applying classifier
    logging.info('Applying classifier ...')
    multiclass_classify(selected_pairs, features, args.analysis_path,
                        args.embed)

    logging.info('Done! CST analysis out at {}'.format(args.analysis_path))