Example #1
def prepare_data(corpus_train, method):
    dic_json = open_json(corpus_train)
    if 'rstr' in method:  # rstr, different because we have to calculate the desc matrix beforehand
        texts = [infos["text_line"] for x, infos in dic_json.items()]
        desc, matrix = get_matrix_rstr(method, dic_json, desc_arg=None)
        v = DictVectorizer()
        X = v.fit_transform(matrix)
        y, L_IDS = [], []
        for ID, infos in dic_json.items():
            y.append(infos["label"])
            L_IDS.append(ID)
        y.append('0')
        L_IDS.append('-1')
    else:
        X, y, L_IDS = [], [], []
        desc = {}
        for ID, infos in dic_json.items():
            res_vectorize = vectorize(infos, method, desc=desc)
            X.append(res_vectorize[0])
            desc = res_vectorize[1]
        y = [infos['label'] for ID, infos in dic_json.items()]
        L_IDS = list(dic_json.keys())
    return X, y, L_IDS, desc
Example #2
def vectorizeWordSequences(sequences, converters):
	"""
	Vectorizes the sequences
	"""
	results = []

	dim = -1

	#vectorize and pad each sequence
	for seq in sequences:

		matrix = v.vectorize(seq.instances, converters)
		hasShape = len(matrix.shape) > 1

		#get the shape of the matrix
		if hasShape:
			(length, dim) = matrix.shape	

		else:
			length = matrix.shape[0]

		pad = n.zeros( (c.maxLen - length, dim) )

		#stack the padding below the matrix; if the matrix is empty, just use the padding
		if hasShape:
			matrix = n.vstack((matrix,pad))
		else:
			matrix = pad

		results.append(matrix)
	
	return n.array(results)
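	# Assumed contract of v.vectorize(): it returns a (len(seq.instances), dim)
	# matrix per sequence. Each matrix is zero-padded to c.maxLen rows, so the
	# stacked result returned above has shape (num_sequences, c.maxLen, dim).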
Example #3
def main(command, project_dir, project_name, import_samples, analyze, base_path, imports_type):
    if import_samples is not None:
        run_ghidra(command, [project_dir, project_name, "-import", import_samples])

    logs_path = os.path.join(sys.path[0], "apilogs_nonapt")
    data_path = os.path.join(sys.path[0], "data", "normalized_nonapt")
    if analyze:
        scripts_path = os.path.join(sys.path[0], "ghidra_scripts")
        apidb_path = os.path.join(sys.path[0], "data", "apidb.json")
        run_ghidra(command, [project_dir, project_name, "-process", "-noanalysis", "-readOnly",
                   "-scriptPath", scripts_path,
                   "-postScript", "aptscout.py", "all", "/v", "/log", logs_path, "/apidb", apidb_path])

    if base_path is not None and os.path.isdir(logs_path):
        dataset = [sample.replace(".json", "") for sample in os.listdir(logs_path)]
        vectorize(dataset, logs_path, base_path, imports_type, data_path)
    return 0
Example #4
def movie_recommend(title):
    cosine_similarities, movie_title, indices = vectorize()

    idx = indices[title]
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]

    movie_indices = [i[0] for i in sim_scores]

    print(movie_title.iloc[movie_indices].to_json())
    return movie_title.iloc[movie_indices].to_json()
Example #5
def setupDataSet(dataPath, eventsFile, converters, includeAll):
    """
	Preps the data for learning
	"""
    #read the event annotations
    events = readEvents(eventsFile)

    #read the data
    rawData, labels = createInstances(readDocs(dataPath, events), events,
                                      includeAll)

    #vectorize it
    data = v.vectorize(rawData, converters)

    return data, labels, events
Example #6
def search(dir):
    # get the well-trained model and vectors of docs
    tfidf, tfidf_vectors, docs = vectorize(dir)

    # get user's query and preprocess it
    query = input('Please input your search query: ')
    query = [query_preprocess(query)]

    # vectorize the query
    query_vector = tfidf.transform(query)

    # calculate similarity
    cosine_similarities = linear_kernel(query_vector, tfidf_vectors).flatten()
    related_docs_idx = cosine_similarities.argsort()[:-11:-1]
    print('Search Result: ')
    print(docs['Title'].loc[related_docs_idx].to_string())
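
The vectorize() helper this example relies on is not shown here. The sketch below is only a minimal guess at its contract, returning a fitted TfidfVectorizer, the sparse document-term matrix, and the documents as a DataFrame; the CSV input and the 'Description' column are assumptions for illustration ('Title' comes from the snippet above).

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize(path):
    # load the documents; a 'Description' text column is assumed here
    docs = pd.read_csv(path)
    tfidf = TfidfVectorizer(stop_words='english')
    # fit TF-IDF on the document text and keep the sparse matrix for similarity queries
    tfidf_vectors = tfidf.fit_transform(docs['Description'])
    return tfidf, tfidf_vectors, docs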
Example #7
def setupDataSet(dataPath, eventsFile, windowConv, contextConvs):
	"""
	Preps the data for learning
	"""
	#read the event annotations
	events = readEvents(eventsFile)

	#read the data
	rawData, labels = createInstances(readDocs(dataPath, events), events)

	left = n.array([windowConv.convert(i) for i in rawData])

	#vectorize it
	right = vectorize(rawData, contextConvs)

	return (left, right), labels, [i.event for i in rawData]
Example #8
def preprocesssing_data(type, sign, tags, all):
    inputs, outputs = [], []
    for x in select(type, {'tag': {'$in': tags}}):
        if x['date'] > datetime(2019, 10, 15):
            try:
                prices = get_price(sign, x['date'])
                print(prices)
                if not math.isnan(prices['actual']):
                    if type == DBNAMES.BAGS_OF_WORDS or type == DBNAMES.NOUNS:
                        inputs.append({"data": vectorize(x['text_vector'], all), "date": x['date']})
                    if type == DBNAMES.NAMES_ENTITIES:
                        inputs.append({"data": vectorize_named_entities(x['text_vector'], all), "date": x['date']})
                    outputs.append(get_price_trend(prices['before'], prices['actual'], prices['after']))
            except pymongo.errors.CursorNotFound:
                print("cursor error")

    return inputs, outputs
Example #9
def run_batch(raster, tiles, model, weights, classes, output, gpu, cpu_only):
    net = setup_net(model, weights, gpu, cpu_only)

    # read classes metadata
    with open(classes) as c:
        colors = map(lambda x: x['color'][1:], json.load(c))
        colors.append('000000')
        colors = map(lambda rgbstr: tuple(map(ord, rgbstr.decode('hex'))),
                     colors)

    centerlines_file = op.join(output, 'complete.geojson')
    centerlines = open(centerlines_file, 'w')

    with open(tiles) as tile_list:
        for tile in tile_list:
            try:
                click.echo('processing: %s' % tile.strip())
                x, y, z = [int(t) for t in tile.strip().split('-')]
                image = get_image_tile(raster, x, y, z)
                image.save(op.join(output, '%s_real.png' % tile.strip()))

                # run prediction
                predicted_file = op.join(output, '%s.png' % tile.strip())
                make_prediction(net, colors, image, threshold, predicted_file)

                # trace raster -> polygons
                polygonized_file = op.join(output, '%s.geojson' % tile.strip())
                with open(polygonized_file, 'w') as p:
                    p.write(json.dumps(vectorize(predicted_file)))

                # polygons => centerlines
                polyspine_args = map(
                    str, [polys_to_lines, polygonized_file, x, y, z, 0.2])
                exitcode = subprocess.call(polyspine_args, stdout=centerlines)

                if exitcode != 0:
                    raise Exception('Vectorize exited nonzero')

            except TileNotFoundError:
                click.echo('Imagery tile not found.')
            except Exception as err:
                click.echo(err)

    centerlines.close()
Example #11
File = open('csv/source.csv')
Reader = csv.reader(File)

isfile = False
if os.path.isfile('csv/response.csv'):
    isfile = True

outputFile = open('csv/response.csv', 'a')
outputWriter = csv.writer(outputFile)

if not isfile:
    outputWriter.writerow(
        ['PALAVRA', 'Q1', 'Q2', 'Q3', 'Q4', 'VOGAIS', 'CONSOANTES', 'É TAG'])

k_words = []
print 'Responda para as palavras a seguir:\nÉ tag? 0-Não 1-Sim\n'  # "Answer for the following words: Is it a tag? 0-No 1-Yes"
for row in Reader:
    word = row[0].decode('utf-8').lower()
    if word in k_words:
        continue
    print '%20s' % word,
    is_tag = input()
    print "\033[A                                             \033[A"
    if is_tag:
        outputWriter.writerow([word.encode('utf-8')] + vectorize(word) + [1])
    else:
        outputWriter.writerow([word.encode('utf-8')] + vectorize(word) + [0])
    k_words.append(word)
outputFile.close()
Example #12
def main():


    ########################## 
    # Dataset initialization # 
    ########################## 

    print('Dataset initialization')
    
    try :

        vectors = pickle.load(open(VECTORS_PATH, 'rb'))
        xs, ys = vectors['xs'], vectors['ys']

    except FileNotFoundError:

        xs, ys = vectorize(DATA_PATH, LABEL_PATH)
        pickle.dump({ 'xs': xs, 'ys': ys }, open(VECTORS_PATH, 'wb'))
        
    print('Class Distribution Bar Graph')
    class_dist_bar(LABEL_PATH)

    ##########################
    # Parameter Optimization #
    ##########################

    print('Parameter Optimization')
    max_depth = int(len(xs[1]) * .40) - 1
    single = int(max_depth/5)

    # Random Forest Parameter Grid
    rfc_param_grid = [{
        'n_estimators': [i for i in range(100, 1100, 100)],
        'max_depth': [i for i in range(2, 22, 2)],
        # 'n_jobs': [NUM_CORES],
        'random_state': [RANDOM_STATE] 
    }]
    rfc_px_len = len(rfc_param_grid[0]['n_estimators'])
    rfc_py_len = len(rfc_param_grid[0]['max_depth'])

    # Gradient Boost Parameter Grid
    gbc_param_grid = [{
        'n_estimators': [i for i in range(100, 1100, 100)],
        'max_depth': [i for i in range(2, 22, 2)],
        'random_state': [RANDOM_STATE] 
    }]
    gbc_px_len = len(gbc_param_grid[0]['n_estimators'])
    gbc_py_len = len(gbc_param_grid[0]['max_depth'])
      
    # XGBoost Parameter Grid
    xgb_param_grid = [{
        'nthread': [NUM_CORES], 
        'objective': ['binary:logistic'],
        'learning_rate': [0.05], 
        'n_estimators': [i for i in range(100, 1200, 100)],
        'max_depth': [i for i in range(2, 22, 2)],
        'seed': [RANDOM_STATE_XGB]
    }]

    xgb_px_len = len(xgb_param_grid[0]['n_estimators'])
    xgb_py_len = len(xgb_param_grid[0]['max_depth'])
    
    # Random Forest
    print('\tRandom Forest')

    try :

        rfc_results = pickle.load(open(RFC_GRID_SEARCH_PATH, 'rb'))
        param_selection_heat_map(rfc_results, rfc_px_len, rfc_py_len, GRID_SEARCH_CV_FOLDS, RFC_GRID_SEARCH_GRAPH_PATH, 'Random Forest Classifier Parameter Selection')

    except FileNotFoundError:

        rfc_results = optimize_hyper_params('rfc', rfc_param_grid, xs, ys)
        pickle.dump(rfc_results, open(RFC_GRID_SEARCH_PATH, 'wb'))
        param_selection_heat_map(rfc_results, rfc_px_len, rfc_py_len, GRID_SEARCH_CV_FOLDS, RFC_GRID_SEARCH_GRAPH_PATH, 'Random Forest Classifier Parameter Selection')

    # Gradient Boosted Trees 
      
    print('\tGradient Boosted Trees')

    try:

        gbc_results = pickle.load(open(GBC_GRID_SEARCH_PATH, 'rb')) 
        param_selection_heat_map(gbc_results, gbc_px_len, gbc_py_len, GRID_SEARCH_CV_FOLDS, GBC_GRID_SEARCH_GRAPH_PATH, 'Gradient Boosted Trees Parameter Selection')

    except FileNotFoundError:

        gbc_results = optimize_hyper_params('gbc', gbc_param_grid, xs, ys)
        pickle.dump(gbc_results, open(GBC_GRID_SEARCH_PATH, 'wb'))
        param_selection_heat_map(gbc_results, gbc_px_len, gbc_py_len, GRID_SEARCH_CV_FOLDS, GBC_GRID_SEARCH_GRAPH_PATH, 'Gradient Boosted Trees Parameter Selection')

    # XGBoost

    print('\tXGBoost')
    
    try:

        xgb_results = pickle.load(open(XGB_GRID_SEARCH_PATH, 'rb')) 
        param_selection_heat_map(xgb_results, xgb_px_len, xgb_py_len, GRID_SEARCH_CV_FOLDS, XGB_GRID_SEARCH_GRAPH_PATH, 'XGBoost Parameter Selection')

    except FileNotFoundError:

        xgb_results = optimize_hyper_params('xgb', xgb_param_grid, xs, ys)
        pickle.dump(xgb_results, open(XGB_GRID_SEARCH_PATH, 'wb'))
        param_selection_heat_map(xgb_results, xgb_px_len, xgb_py_len, GRID_SEARCH_CV_FOLDS, XGB_GRID_SEARCH_GRAPH_PATH, 'XGBoost Parameter Selection')

    #################### 
    # Final Train/Test # 
    #################### 

    print('Final Train/Test')

    try:

        final_scores = pickle.load(open(FINAL_RESULTS_PATH, 'rb'))

    except FileNotFoundError:

        opt_params = { 
            'rfc': {
                1:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                2:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
                3:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                4:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
                5:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                6:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
                7:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                8:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
                9:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                10: { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
                11: { 'n_estimators': 100, 'max_depth': 20, 'random_state': RANDOM_STATE }, 
                12: { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
                13: { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                14: { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
                15: { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                16: { 'n_estimators': 100, 'max_depth': 16, 'random_state': RANDOM_STATE },
                17: { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
            },  
            'gbc': {
                1: { 'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE }, 
                2: { 'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE },
                3: { 'n_estimators': 600, 'max_depth': 4, 'random_state': RANDOM_STATE }, 
                4: { 'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE },
                5: { 'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                6: { 'n_estimators': 100, 'max_depth': 8, 'random_state': RANDOM_STATE },
                7: { 'n_estimators': 100, 'max_depth': 4, 'random_state': RANDOM_STATE }, 
                8: { 'n_estimators': 200, 'max_depth': 12, 'random_state': RANDOM_STATE },
                9: { 'n_estimators': 100, 'max_depth': 8, 'random_state': RANDOM_STATE }, 
                10: { 'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE },
                11: { 'n_estimators': 900, 'max_depth': 4, 'random_state': RANDOM_STATE }, 
                12: { 'n_estimators': 200, 'max_depth': 2, 'random_state': RANDOM_STATE },
                13: { 'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE }, 
                14: { 'n_estimators': 300, 'max_depth': 4, 'random_state': RANDOM_STATE },
                15: { 'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE }, 
                16: { 'n_estimators': 300, 'max_depth': 8, 'random_state': RANDOM_STATE },
                17: { 'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE },
            }   
        }   

        final_scores = {}
        for i in range(1, 18):
            rfc = RandomForestClassifier()
            rfc.set_params(**opt_params['rfc'][i])
            gbc = GradientBoostingClassifier()
            gbc.set_params(**opt_params['gbc'][i])
            # xgb = XGBClassifier()
            final_scores[i] = {}
            final_scores[i]['rfc'] = cross_val_score(rfc, xs[i], ys[i], cv=CV_FOLDS, n_jobs=NUM_CORES, scoring='f1')
            final_scores[i]['gbc'] = cross_val_score(gbc, xs[i], ys[i], cv=CV_FOLDS, n_jobs=NUM_CORES, scoring='f1')
            # final_scores[i]['xgb'] = cross_val_score(xgb, np.array(xs[i]), np.array(ys[i]), cv=CV_FOLDS, scoring='f1')

        pickle.dump(final_scores, open(FINAL_RESULTS_PATH, 'wb'))

    fig, axarr = plt.subplots(5, 4, figsize=(25, 25))
    for i in range(1, 18):

        a, b = final_scores[i]['rfc'], final_scores[i]['gbc']
        # a, b, c = final_scores[i]['rfc'], final_scores[i]['gbc'], final_scores[i]['xgb']
        row, col = int((i-1)/4), (i-1)%4
        axarr[row][col].boxplot([a, b])
        # axarr[row][col].boxplot([a, b, c])
        axarr[row][col].set_title('Body Zone %s' % (i), fontsize=28)
        axarr[row][col].set_xticklabels(['RFC', 'GBC'], fontsize=24)
        # axarr[row][col].set_xticklabels(['RFC', 'GBC', 'XGB'])
        axarr[row][col].set_ylabel('F1 Score', fontsize=24)

    for i in range(1, 4): axarr[4][i].axis('off')
    plt.suptitle("Model Comparison", fontsize=30, fontweight='bold')
    plt.tight_layout()
    plt.subplots_adjust(top=.95)
    plt.savefig(FINAL_RESULTS_GRAPH_PATH)
Example #13
def run_batch(queue_name, image_tiles, model, weights, classes, gpu, cpu_only):
    net = setup_net(model, weights, gpu, cpu_only)
    classes_file = resolve_s3(classes)

    # read classes metadata
    with open(classes_file) as classes:
        colors = map(lambda x: x['color'][1:], json.load(classes))
        colors.append('000000')
        colors = map(lambda rgbstr: tuple(map(ord, rgbstr.decode('hex'))),
                     colors)

    count = 0
    centerlines = tempfile.NamedTemporaryFile(suffix='.geojson', delete=False)
    click.echo('geojson output: %s' % centerlines.name)

    for message in receive(queue_name):
        try:
            click.echo('processing: %s' % message.body)
            (output_bucket, prefix, z, x, y) = json.loads(message.body)

            image = get_image_tile(image_tiles, x, y, z)

            # run prediction
            predicted = tempfile.NamedTemporaryFile(suffix='.png',
                                                    delete=False)
            make_prediction(net, colors, image, predicted)
            predicted.close()

            # upload raster prediction image
            key = '%s/%s/%s/%s.png' % (prefix, z, x, y)
            s3.upload_file(predicted.name,
                           output_bucket,
                           key,
                           ExtraArgs={'ContentType': 'image/png'})

            # trace raster -> polygons
            polygonized = tempfile.NamedTemporaryFile(suffix='.geojson',
                                                      delete=False)
            polygonized.write(json.dumps(vectorize(predicted.name)))
            polygonized.close()

            # upload polygon geojson for this tile
            key = '%s/%s/%s/%s.polygons.geojson' % (prefix, z, x, y)
            s3.upload_file(polygonized.name,
                           output_bucket,
                           key,
                           ExtraArgs={'ContentType': 'application/json'})

            # polygons => centerlines
            polyspine_args = map(
                str, [polys_to_lines, polygonized.name, x, y, z, 0.2])
            exitcode = subprocess.call(polyspine_args, stdout=centerlines)

            # clean up tempfiles
            os.remove(predicted.name)
            os.remove(polygonized.name)

            if exitcode != 0:
                raise Exception('Vectorize exited nonzero')

            # upload centerlines geojson to S3 every so often
            count += 1
            if count % 5000 == 0:
                centerlines.close()
                upload_centerlines(centerlines.name, output_bucket, prefix)
                # clear the file out and continue writing
                centerlines = open(centerlines.name, 'w+b')

            # remove message from the queue
            message.delete()
        except TileNotFoundError:
            click.echo('Imagery tile not found.')
            message.delete()
        except Exception as err:
            click.echo(err)
            try:
                message.delete()
            except Exception:
                pass

    centerlines.close()
    upload_centerlines(centerlines.name, output_bucket, prefix)
Example #14
    # 'french',
    # 'nazarov',
    'parupa',
    'samoan_no_vowels',
    'samoan'
]

for d in DATASETS:
    print('processing ' + d + ' dataset')
    data = 'corpora/' + d + '.txt'
    output = 'output_no_unique/' + d + '.txt'
    """
    1. Vector Embedding
    """
    print('\tvector embedding...')
    vecs, vocab = vectorize(data, unique=False)
    """
    2. Normalization
    """
    print('\tnormalization...')
    ppmi = vecs_to_ppmi(vecs)
    """
    3. PCA and Clustering
    """
    print('\tclustering...', output)
    cls = find_classes(ppmi,
                       vocab,
                       set([tuple(vocab.keys())]),
                       max_k=2,
                       max_pcs=1)
    with open(output, 'w') as out:
Example #15
 pickle.dump([model, desc], outFile)
 print('Model saved.')
 # 2. Annotation dev and test corpus
 print('Labeling...')
 for corpus_to_test in [dev_corpus, test_corpus]:
     corpus = open_json(corpus_to_test)
     if 'rstr' in method:
         _, matrix = get_matrix_rstr(method,
                                     corpus,
                                     desc_arg=desc)
         v = DictVectorizer()
         X = v.fit_transform(matrix)
         y_pred = model.predict(X)[:-1]
     else:
         vectors = [
             vectorize(infos, method, desc, test=True)[0]
             for instance_id, infos in corpus.items()
         ]
         if 'ngramChar' in method:
             for num_desc in desc.values():
                 if num_desc not in vectors[0]:
                     vectors[0][num_desc] = 0
             dictvectorizer = DictVectorizer(
             )  # ii. Transformation en sparse matrix
             vectors = dictvectorizer.fit_transform(vectors)
         y_pred = model.predict(vectors)
     ids = [id for id in list(corpus.keys())]
     outFile_path = config[
         'models_annots_dir'] + corpus_to_test.split(
             '/'
         )[-1].replace(
Example #16
    Date last modified: 9/8/16
    Python Version: 2.7
'''

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, mean_squared_error, roc_auc_score, precision_score, recall_score

from math import sqrt
import pandas as pd
from extract import extract_dir
from vectorize import vectorize

print('\nExtracting Data...')
emails = vectorize(extract_dir("CSDMC2010_SPAM/TRAINING"), training=True)
labels = pd.read_table("CSDMC2010_SPAM/SPAMTrain.label",
                       delim_whitespace=True,
                       names=["labels", "files"])["labels"]  #SPAM = 0, HAM = 1

print('Splitting...')
X_train, X_test, y_train, y_test = train_test_split(emails,
                                                    labels,
                                                    test_size=0.33,
                                                    random_state=42)

print('Training...')
model = LogisticRegression()
model.fit(X_train, y_train)

print('Saving Model...')
Example #17
def classify(data):
    vect_input = np.asarray([vectorize(data, scale_size=512)]) / 256
    results = model.predict(vect_input)[0]
    return [float(x) for x in results]
Example #18
    'french',
    'nazarov',
    'parupa',
    'samoan_no_vowels',
    'samoan'
]

for d in DATASETS:
    print('processing ' + d + ' dataset')
    data = 'corpora/' + d + '.txt'
    output = 'output/' + d + '.txt'
    """
    1. Vector Embedding
    """
    print('\tvector embedding...')
    vecs, vocab = vectorize(data, unique=True)
    """
    2. Normalization
    """
    print('\tnormalization...')
    ppmi = vecs_to_ppmi(vecs)
    """
    3. PCA and Clustering
    """
    print('\tclustering...', output)
    cls = find_classes(ppmi,
                       vocab,
                       set([tuple(vocab.keys())]),
                       max_k=2,
                       max_pcs=1)
    with open(output, 'w') as out:
Example #19
import pandas as pd
import datetime

from database import insert_article_data, get_article_tokens, insert_vector_data
from get_content import get_items
from vectorize import vectorize

vec_path = './data/test.csv'

end = datetime.date.today()
start = end - datetime.timedelta(days=1)

df = get_items(start, end)
insert_article_data(df)

df = df[['article_id', 'tokens']]
df = vectorize(df)
insert_vector_data(df)
df.to_csv(vec_path, mode='a', index=False)
Example #20
def dice(vector_1, vector_2):
    return 2 * soft_division(
        reduce(lambda p, c: p + min(c[0], c[1]), zip(vector_1, vector_2), 0),
        reduce(lambda p, c: p + c[0] + c[1], zip(vector_1, vector_2), 0),
    )


if __name__ == "__main__":
    from argparse import ArgumentParser
    from read import read
    from preprocess import preprocess
    from vectorize import vectorize, tfidf_vectorizer
    from db import words
    from pprint import pprint

    parser = ArgumentParser()
    parser.add_argument("-i", dest="path", type=str, help="path to text")
    parser.add_argument("-r", dest="request", type=str, help="request")
    args = parser.parse_args()
    model, vectorizer, db = vectorize(tfidf_vectorizer,
                                      preprocess(read(args.path)))
    pprint([
        calc_metrics(
            metric,
            list(preprocess([args.request]))[0],
            model,
            vectorizer,
            words(db),
        ) for metric in (cossine, jaccard, dice)
    ])
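
soft_division() is not defined in this snippet; it is assumed to be a zero-safe divide. A minimal sketch under that assumption, with the arithmetic for one dice() call:

def soft_division(numerator, denominator):
    # assumed behaviour: return 0.0 instead of raising on a zero denominator
    return numerator / denominator if denominator else 0.0

# dice([1, 0, 2], [1, 1, 1]):
#   numerator   = min(1,1) + min(0,1) + min(2,1) = 2
#   denominator = (1+1) + (0+1) + (2+1)          = 6
#   result      = 2 * soft_division(2, 6)        ≈ 0.667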
Example #21
def get_data(key):
    print('Got ' + key)
    obj = io.BytesIO()
    s3.download_fileobj('isitanime-data-clean', key, obj)
    return np.asarray(vectorize(obj, scale_size=512)) / 256
Example #22
            masked_data = np.ma.masked_equal(data,
                                             data_band.GetNoDataValue(),
                                             copy=False)
        masked_data.fill_value = no_data_value
        masked_data = np.ma.fix_invalid(masked_data, copy=False)
        data = masked_data.data

    return data, transform


def define_array():
    a = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                  [0, 1, 1, 2, 2, 3, 3, 2, 1, 0],
                  [0, 1, 2, 2, 3, 4, 4, 4, 1, 0],
                  [0, 1, 2, 3, 4, 4, 5, 1, 0, 0],
                  [0, 1, 3, 4, 3, 4, 5, 1, 0, 0],
                  [0, 1, 3, 4, 3, 4, 5, 1, 0, 0],
                  [0, 1, 2, 3, 4, 4, 5, 1, 0, 0],
                  [0, 1, 2, 2, 3, 4, 4, 4, 1, 0],
                  [0, 1, 1, 2, 2, 3, 3, 2, 1, 0],
                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
    return a


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)

    d, t = read_image(sys.argv[1], no_data_value=0)
    a_transform = np.array([[t[1], t[2]], [t[4], t[5]], [t[0], t[3]]])
    vectorize.vectorize(d, a_transform, range(0, 96, 1), exact=True)
Example #23
'''
    File name: predict.py
    Author: Austin Jacobs
    Date created: 9/8/16
    Date last modified: 9/8/16
    Python Version: 2.7
'''

from sklearn.externals import joblib
from extract import extract_file, extract_dir
from vectorize import vectorize

scale = raw_input('(1) File (2) Directory : ')

if scale == '1':
    filename = raw_input('File Name: ')
    email = [extract_file(filename)]
elif scale == '2':
    direc = raw_input('Directory: ')
    email = extract_dir(direc)
else:
    print('Invalid')
    exit()

email = vectorize(email)
model = joblib.load('Models/Filter_Model.pkl')
pred = model.predict(email)

print(['SPAM' if x == 0 else 'HAM' for x in pred])
Example #24
from libsvm import convert_to_libsvm
import numpy as np
import pandas as pd
import LSTM_vectorizer as LSTM_vec

# Uncomment this if you're running it for the first time
dim = 50

# GloVe
# glove = lf.load_glove("../Data/glove.6B.50d.txt")
# np.save("../Data/glove50d", glove)
glove = np.load("../Data/glove50d.npy").item()

# Training set
traindata = pd.read_csv('../Data/train.csv')
traindata = traindata.replace(np.nan, '', regex=True)
vec.vectorize(dim, glove, traindata, is_train=True)  #Ordinary vectorizing

# Test set
testdata = pd.read_csv('../Data/test.csv')
testdata = testdata.replace(np.nan, '', regex=True)
vec.vectorize(dim, glove, testdata, is_train=False)  #Ordinary vectorizing

#
# # Load np files if already vectorized
# # train_vector = np.load("../Data/train_vector.npy")
# # test_vector = np.load("../Data/test_vector.npy")
# convert_to_libsvm(train_vector, traindata, True)
# convert_to_libsvm(test_vector, testdata, False)

# lf.conv_to_csv("../Data/pred.txt")
Example #25
    parser = ArgumentParser()
    parser.add_argument("-i1", dest="path1", type=str, help="path to text")
    parser.add_argument("-i2", dest="path2", type=str, help="path to text")
    parser.add_argument("-r", dest="request", type=str, help="request")
    args = parser.parse_args()

    first_class_texts = list(read(args.path1))
    second_class_texts = list(read(args.path2))
    count = len(first_class_texts) + len(second_class_texts)
    classes = (
        range(0, len(first_class_texts)),
        range(len(first_class_texts), count),
    )
    preprocess.preprocessors.append(preprocess.rm_stop_words)
    model, vectorizer, db = vectorize(
        tfidf_vectorizer,
        preprocess.preprocess(first_class_texts + second_class_texts),
    )
    res = list(
        rocchio(
            list(preprocess.preprocess([args.request]))[0],
            model,
            words(db),
            vectorizer,
            classes,
            euclide,
        )
    )
    res.sort()
    pprint(res)
Example #26
import numpy as np
from vectorize import vectorize
from annoy import AnnoyIndex
import pickle
import math
import random
from nltk.stem import PorterStemmer
import os

INIT = False
pklpath = 'glove.6B.100d.pkl'
stemmer = PorterStemmer()

if (INIT):
    index_to_word, word_to_index, vec = vectorize('glove.6B.50d.txt',
                                                  limit=20000,
                                                  lemma_only=True,
                                                  pkl=True,
                                                  pklpath=pklpath)
else:
    with open(pklpath, 'rb') as f:
        (index_to_word, word_to_index, vec) = pickle.load(f)

vec_short = vec[:5001]
print('embeddings loaded!')
a = AnnoyIndex(vec.shape[1], 'angular')

for i in range(vec.shape[0]):
    a.add_item(i, vec[i])

a.build(30)
Example #27
import math

import pandas as pd
from vectorize import vectorize
from text_preprocess import query_preprocess
from scipy import spatial


def calculate_dcg(items):
    dcg = 0
    i = 0
    for item in items:
        i += 1
        dcg += item / math.log(i + 1, 2)
    return dcg
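# Worked example of the DCG computation above (log base 2):
#   calculate_dcg([3, 2, 1]) = 3/log2(2) + 2/log2(3) + 1/log2(4)
#                            = 3.0 + 1.262 + 0.5 ≈ 4.762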


# load the tfidf vectorizer and the tfidf vector of docs
tfidf, tfidf_vectors, docs = vectorize('description_doc.csv')

# load the query list and true score of rankings
relevance = pd.read_csv('relevance.csv')
test_query = relevance.columns.values[1:]
true_score = []
for col in relevance:
    true_score.append(relevance[col].tolist())
true_score = true_score[1:]

# preprocess the query in query list
query_list = []
for query in test_query:
    query_list.append([query_preprocess(query)])

# convert the query to vector
Example #28
            masked_data = np.ma.masked_values(data, data_band.GetNoDataValue(), copy=False)
        else:
            masked_data = np.ma.masked_equal(data, data_band.GetNoDataValue(), copy=False)
        masked_data.fill_value = no_data_value
        masked_data = np.ma.fix_invalid(masked_data, copy=False)
        data = masked_data.data

    return data, transform


def define_array():
    a = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                  [0, 1, 1, 2, 2, 3, 3, 2, 1, 0],
                  [0, 1, 2, 2, 3, 4, 4, 4, 1, 0],
                  [0, 1, 2, 3, 4, 4, 5, 1, 0, 0],
                  [0, 1, 3, 4, 3, 4, 5, 1, 0, 0],
                  [0, 1, 3, 4, 3, 4, 5, 1, 0, 0],
                  [0, 1, 2, 3, 4, 4, 5, 1, 0, 0],
                  [0, 1, 2, 2, 3, 4, 4, 4, 1, 0],
                  [0, 1, 1, 2, 2, 3, 3, 2, 1, 0],
                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                  ])
    return a

if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)

    d, t = read_image(sys.argv[1], no_data_value=0)
    a_transform = np.array([[t[1], t[2]], [t[4], t[5]], [t[0], t[3]]])
    vectorize.vectorize(d, a_transform, range(0, 96, 1), exact=True)
Example #29
test_ratio = 0.2

############################
# Get data
############################
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train = newsgroups_train.data
y_train = [l.split('/')[-2] for l in newsgroups_train.filenames]
y_train = [['Top/' + '/'.join(label.split('.'))] for label in y_train]

X_test = newsgroups_test.data
y_test = [l.split('/')[-2] for l in newsgroups_test.filenames]
y_test = [['Top/' + '/'.join(label.split('.'))] for label in y_test]

vectorizer, X_train_matrix = vectorize(X_train)

#############################
# Build the tree, train it and evaluate prediction
#############################
tree = Tree(classifier_type, X_train_matrix, y_train, max_level)
tree.train()
predict_labels = tree.predict_text(X_test, vectorizer)
eval_res = evaluate(y_test, predict_labels, 6)

print('precision:')
print(eval_res[0])
print('recall:')
print(eval_res[1])
print('F1:')
print(eval_res[2])