def prepare_data(corpus_train, method):
    dic_json = open_json(corpus_train)
    if 'rstr' in method:  # rstr is different because we have to calculate the desc matrix beforehand
        texts = [infos["text_line"] for x, infos in dic_json.items()]
        desc, matrix = get_matrix_rstr(method, dic_json, desc_arg=None)
        v = DictVectorizer()
        X = v.fit_transform(matrix)
        y, L_IDS = [], []
        for ID, infos in dic_json.items():
            y.append(infos["label"])
            L_IDS.append(ID)
        y.append('0')
        L_IDS.append('-1')
    else:
        X, y, L_IDS = [], [], []
        desc = {}
        for ID, infos in dic_json.items():
            if 'POS' in method:
                res_vectorize = vectorize(infos, method, desc=desc)
            else:
                res_vectorize = vectorize(infos, method, desc=desc)
            X.append(res_vectorize[0])
            desc = res_vectorize[1]
        y = [infos['label'] for ID, infos in dic_json.items()]
        L_IDS = dic_json.keys()
    return X, y, L_IDS, desc
def vectorizeWordSequences(sequences, converters):
    """
    Vectorizes the sequences
    """
    results = []
    dim = -1

    # vectorize and pad each sequence to c.maxLen rows
    for seq in sequences:
        matrix = v.vectorize(seq.instances, converters)
        hasShape = len(matrix.shape) > 1

        # get the shape of the matrix
        if hasShape:
            (length, dim) = matrix.shape
        else:
            length = matrix.shape[0]

        pad = n.zeros((c.maxLen - length, dim))

        # if the matrix is totally empty then just use the padding
        if hasShape:
            matrix = n.vstack((matrix, pad))
        else:
            matrix = pad

        results.append(matrix)

    return n.array(results)
def main(command, project_dir, project_name, import_samples, analyze, base_path, imports_type):
    if import_samples is not None:
        run_ghidra(command, [project_dir, project_name, "-import", import_samples])

    logs_path = os.path.join(sys.path[0], "apilogs_nonapt")
    data_path = os.path.join(sys.path[0], "data", "normalized_nonapt")

    if analyze:
        scripts_path = os.path.join(sys.path[0], "ghidra_scripts")
        apidb_path = os.path.join(sys.path[0], "data", "apidb.json")
        run_ghidra(command, [project_dir, project_name, "-process", "-noanalysis", "-readOnly",
                             "-scriptPath", scripts_path,
                             "-postScript", "aptscout.py", "all",
                             "/v", "/log", logs_path, "/apidb", apidb_path])

    if base_path is not None and os.path.isdir(logs_path):
        dataset = [sample.replace(".json", "") for sample in os.listdir(logs_path)]
        vectorize(dataset, logs_path, base_path, imports_type, data_path)
    return 0
def movie_recommend(title):
    cosine_similarities, movie_title, indices = vectorize()
    idx = indices[title]

    # rank every movie by cosine similarity to the requested title
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # keep the top 30 matches, skipping the movie itself at position 0
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]

    print(movie_title.iloc[movie_indices].to_json())
    return movie_title.iloc[movie_indices].to_json()
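# A minimal usage sketch for movie_recommend() above (the title is hypothetical;
# it assumes vectorize() was built over a movie-metadata table whose `indices`
# mapping contains that title):
# top_matches_json = movie_recommend("Toy Story")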
def setupDataSet(dataPath, eventsFile, converters, includeAll):
    """
    Preps the data for learning
    """
    # read the event annotations
    events = readEvents(eventsFile)

    # read the data
    rawData, labels = createInstances(readDocs(dataPath, events), events, includeAll)

    # vectorize it
    data = v.vectorize(rawData, converters)

    return data, labels, events
def search(dir):
    # get the well-trained model and vectors of docs
    tfidf, tfidf_vectors, docs = vectorize(dir)

    # get user's query and preprocess it
    query = input('Please input your search query: ')
    query = [query_preprocess(query)]

    # vectorize the query
    query_vector = tfidf.transform(query)

    # calculate similarity
    cosine_similarities = linear_kernel(query_vector, tfidf_vectors).flatten()
    related_docs_idx = cosine_similarities.argsort()[:-11:-1]

    print('Search Result: ')
    print(docs['Title'].loc[related_docs_idx].to_string())
def setupDataSet(dataPath, eventsFile, windowConv, contextConvs):
    """
    Preps the data for learning
    """
    # read the event annotations
    events = readEvents(eventsFile)

    # read the data
    rawData, labels = createInstances(readDocs(dataPath, events), events)

    left = n.array([windowConv.convert(i) for i in rawData])

    # vectorize it
    right = vectorize(rawData, contextConvs)

    return (left, right), labels, [i.event for i in rawData]
def preprocesssing_data(type, sign, tags, all):
    inputs, outputs = [], []
    for x in select(type, {'tag': {'$in': tags}}):
        if x['date'] > datetime(2019, 10, 15):
            try:
                prices = get_price(sign, x['date'])
                print(prices)
                if not math.isnan(prices['actual']):
                    if type == DBNAMES.BAGS_OF_WORDS or type == DBNAMES.NOUNS:
                        inputs.append({"data": vectorize(x['text_vector'], all), "date": x['date']})
                    if type == DBNAMES.NAMES_ENTITIES:
                        inputs.append({"data": vectorize_named_entities(x['text_vector'], all), "date": x['date']})
                    outputs.append(get_price_trend(prices['before'], prices['actual'], prices['after']))
            except pymongo.errors.CursorNotFound:
                print("cursor error")
    return inputs, outputs
def run_batch(raster, tiles, model, weights, classes, output, gpu, cpu_only):
    net = setup_net(model, weights, gpu, cpu_only)

    # read classes metadata
    with open(classes) as c:
        colors = map(lambda x: x['color'][1:], json.load(c))
        colors.append('000000')
        colors = map(lambda rgbstr: tuple(map(ord, rgbstr.decode('hex'))), colors)

    centerlines_file = op.join(output, 'complete.geojson')
    centerlines = open(centerlines_file, 'w')

    with open(tiles) as tile_list:
        for tile in tile_list:
            try:
                click.echo('processing: %s' % tile.strip())
                x, y, z = [int(t) for t in tile.strip().split('-')]
                image = get_image_tile(raster, x, y, z)
                image.save(op.join(output, '%s_real.png' % tile.strip()))

                # run prediction
                predicted_file = op.join(output, '%s.png' % tile.strip())
                make_prediction(net, colors, image, threshold, predicted_file)

                # trace raster -> polygons
                polygonized_file = op.join(output, '%s.geojson' % tile.strip())
                with open(polygonized_file, 'w') as p:
                    p.write(json.dumps(vectorize(predicted_file)))

                # polygons => centerlines
                polyspine_args = map(str, [polys_to_lines, polygonized_file, x, y, z, 0.2])
                exitcode = subprocess.call(polyspine_args, stdout=centerlines)
                if exitcode != 0:
                    raise Exception('Vectorize exited nonzero')
            except TileNotFoundError:
                click.echo('Imagery tile not found.')
            except Exception as err:
                click.echo(err)

    centerlines.close()
File = open('csv/source.csv')
Reader = csv.reader(File)

isfile = False
if os.path.isfile('csv/response.csv'):
    isfile = True

outputFile = open('csv/response.csv', 'a')
outputWriter = csv.writer(outputFile)
if not isfile:
    outputWriter.writerow(
        ['PALAVRA', 'Q1', 'Q2', 'Q3', 'Q4', 'VOGAIS', 'CONSOANTES', 'É TAG'])

k_words = []
# Prompt (Portuguese): "Answer for the following words: Is it a tag? 0-No 1-Yes"
print 'Responda para as palavras a seguir:\nÉ tag? 0-Não 1-Sim\n'
for row in Reader:
    word = row[0].decode('utf-8').lower()
    if word in k_words:
        continue
    print '%20s' % word,
    is_tag = input()
    print "\033[A \033[A"
    if is_tag:
        outputWriter.writerow([word.encode('utf-8')] + vectorize(word) + [1])
    else:
        outputWriter.writerow([word.encode('utf-8')] + vectorize(word) + [0])
    k_words.append(word)

outputFile.close()
def main():
    ##########################
    # Dataset initialization #
    ##########################
    print('Dataset initialization')
    try:
        vectors = pickle.load(open(VECTORS_PATH, 'rb'))
        xs, ys = vectors['xs'], vectors['ys']
    except FileNotFoundError:
        xs, ys = vectorize(DATA_PATH, LABEL_PATH)
        pickle.dump({'xs': xs, 'ys': ys}, open(VECTORS_PATH, 'wb'))

    print('Class Distribution Bar Graph')
    class_dist_bar(LABEL_PATH)

    ##########################
    # Parameter Optimization #
    ##########################
    print('Parameter Optimization')
    max_depth = int(len(xs[1]) * .40) - 1
    single = int(max_depth / 5)

    # Random Forest Parameter Grid
    rfc_param_grid = [{
        'n_estimators': [i for i in range(100, 1100, 100)],
        'max_depth': [i for i in range(2, 22, 2)],
        # 'n_jobs': [NUM_CORES],
        'random_state': [RANDOM_STATE]
    }]
    rfc_px_len = len(rfc_param_grid[0]['n_estimators'])
    rfc_py_len = len(rfc_param_grid[0]['max_depth'])

    # Gradient Boost Parameter Grid
    gbc_param_grid = [{
        'n_estimators': [i for i in range(100, 1100, 100)],
        'max_depth': [i for i in range(2, 22, 2)],
        'random_state': [RANDOM_STATE]
    }]
    gbc_px_len = len(gbc_param_grid[0]['n_estimators'])
    gbc_py_len = len(gbc_param_grid[0]['max_depth'])

    # XGBoost Parameter Grid
    xgb_param_grid = [{
        'nthread': [NUM_CORES],
        'objective': ['binary:logistic'],
        'learning_rate': [0.05],
        'n_estimators': [i for i in range(100, 1200, 100)],
        'max_depth': [i for i in range(2, 22, 2)],
        'seed': [RANDOM_STATE_XGB]
    }]
    xgb_px_len = len(xgb_param_grid[0]['n_estimators'])
    xgb_py_len = len(xgb_param_grid[0]['max_depth'])

    # Random Forest
    print('\tRandom Forest')
    try:
        rfc_results = pickle.load(open(RFC_GRID_SEARCH_PATH, 'rb'))
        param_selection_heat_map(rfc_results, rfc_px_len, rfc_py_len, GRID_SEARCH_CV_FOLDS,
                                 RFC_GRID_SEARCH_GRAPH_PATH, 'Random Forest Classifier Parameter Selection')
    except FileNotFoundError:
        rfc_results = optimize_hyper_params('rfc', rfc_param_grid, xs, ys)
        pickle.dump(rfc_results, open(RFC_GRID_SEARCH_PATH, 'wb'))
        param_selection_heat_map(rfc_results, rfc_px_len, rfc_py_len, GRID_SEARCH_CV_FOLDS,
                                 RFC_GRID_SEARCH_GRAPH_PATH, 'Random Forest Classifier Parameter Selection')

    # Gradient Boosted Trees
    print('\tGradient Boosted Trees')
    try:
        gbc_results = pickle.load(open(GBC_GRID_SEARCH_PATH, 'rb'))
        param_selection_heat_map(gbc_results, gbc_px_len, gbc_py_len, GRID_SEARCH_CV_FOLDS,
                                 GBC_GRID_SEARCH_GRAPH_PATH, 'Gradient Boosted Trees Parameter Selection')
    except FileNotFoundError:
        gbc_results = optimize_hyper_params('gbc', gbc_param_grid, xs, ys)
        pickle.dump(gbc_results, open(GBC_GRID_SEARCH_PATH, 'wb'))
        param_selection_heat_map(gbc_results, gbc_px_len, gbc_py_len, GRID_SEARCH_CV_FOLDS,
                                 GBC_GRID_SEARCH_GRAPH_PATH, 'Gradient Boosted Trees Parameter Selection')

    # XGBoost
    print('\tXGBoost')
    try:
        xgb_results = pickle.load(open(XGB_GRID_SEARCH_PATH, 'rb'))
        param_selection_heat_map(xgb_results, xgb_px_len, xgb_py_len, GRID_SEARCH_CV_FOLDS,
                                 XGB_GRID_SEARCH_GRAPH_PATH, 'XGBoost Parameter Selection')
    except FileNotFoundError:
        xgb_results = optimize_hyper_params('xgb', xgb_param_grid, xs, ys)
        pickle.dump(xgb_results, open(XGB_GRID_SEARCH_PATH, 'wb'))
        param_selection_heat_map(xgb_results, xgb_px_len, xgb_py_len, GRID_SEARCH_CV_FOLDS,
                                 XGB_GRID_SEARCH_GRAPH_PATH, 'XGBoost Trees Parameter Selection')

    ####################
    # Final Train/Test #
    ####################
    print('Final Train/Test')
    try:
        final_scores = pickle.load(open(FINAL_RESULTS_PATH, 'rb'))
    except FileNotFoundError:
        opt_params = {
            'rfc': {
                1: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
                2: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
                3: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
                4: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
                5: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
                6: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
                7: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
                8: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
                9: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
                10: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
                11: {'n_estimators': 100, 'max_depth': 20, 'random_state': RANDOM_STATE},
                12: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
                13: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
                14: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
                15: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
                16: {'n_estimators': 100, 'max_depth': 16, 'random_state': RANDOM_STATE},
                17: {'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE},
            },
            'gbc': {
                1: {'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE},
                2: {'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE},
                3: {'n_estimators': 600, 'max_depth': 4, 'random_state': RANDOM_STATE},
                4: {'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE},
                5: {'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE},
                6: {'n_estimators': 100, 'max_depth': 8, 'random_state': RANDOM_STATE},
                7: {'n_estimators': 100, 'max_depth': 4, 'random_state': RANDOM_STATE},
                8: {'n_estimators': 200, 'max_depth': 12, 'random_state': RANDOM_STATE},
                9: {'n_estimators': 100, 'max_depth': 8, 'random_state': RANDOM_STATE},
                10: {'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE},
                11: {'n_estimators': 900, 'max_depth': 4, 'random_state': RANDOM_STATE},
                12: {'n_estimators': 200, 'max_depth': 2, 'random_state': RANDOM_STATE},
                13: {'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE},
                14: {'n_estimators': 300, 'max_depth': 4, 'random_state': RANDOM_STATE},
                15: {'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE},
                16: {'n_estimators': 300, 'max_depth': 8, 'random_state': RANDOM_STATE},
                17: {'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE},
            }
        }

        final_scores = {}
        for i in range(1, 18):
            rfc = RandomForestClassifier()
            rfc.set_params(**opt_params['rfc'][i])
            gbc = GradientBoostingClassifier()
            gbc.set_params(**opt_params['gbc'][i])
            # xgb = XGBClassifier()
            final_scores[i] = {}
            final_scores[i]['rfc'] = cross_val_score(rfc, xs[i], ys[i], cv=CV_FOLDS, n_jobs=NUM_CORES, scoring='f1')
            final_scores[i]['gbc'] = cross_val_score(gbc, xs[i], ys[i], cv=CV_FOLDS, n_jobs=NUM_CORES, scoring='f1')
            # final_scores[i]['xgb'] = cross_val_score(xgb, np.array(xs[i]), np.array(ys[i]), cv=CV_FOLDS, scoring='f1')
        pickle.dump(final_scores, open(FINAL_RESULTS_PATH, 'wb'))

    fig, axarr = plt.subplots(5, 4, figsize=(25, 25))
    for i in range(1, 18):
        a, b = final_scores[i]['rfc'], final_scores[i]['gbc']
        # a, b, c = final_scores[i]['rfc'], final_scores[i]['gbc'], final_scores[i]['xgb']
        row, col = int((i - 1) / 4), (i - 1) % 4
        axarr[row][col].boxplot([a, b])
        # axarr[row][col].boxplot([a, b, c])
        axarr[row][col].set_title('Body Zone %s' % (i), fontsize=28)
        axarr[row][col].set_xticklabels(['RFC', 'GBC'], fontsize=24)
        # axarr[row][col].set_xticklabels(['RFC', 'GBC', 'XGB'])
        axarr[row][col].set_ylabel('Accuracy', fontsize=24)

    for i in range(1, 4):
        axarr[4][i].axis('off')

    plt.suptitle("Model Comparison", fontsize=30, fontweight='bold')
    plt.tight_layout()
    plt.subplots_adjust(top=.95)
    plt.savefig(FINAL_RESULTS_GRAPH_PATH)
def run_batch(queue_name, image_tiles, model, weights, classes, gpu, cpu_only):
    net = setup_net(model, weights, gpu, cpu_only)
    classes_file = resolve_s3(classes)

    # read classes metadata
    with open(classes_file) as classes:
        colors = map(lambda x: x['color'][1:], json.load(classes))
        colors.append('000000')
        colors = map(lambda rgbstr: tuple(map(ord, rgbstr.decode('hex'))), colors)

    count = 0
    centerlines = tempfile.NamedTemporaryFile(suffix='.geojson', delete=False)
    click.echo('geojson output: %s' % centerlines.name)

    for message in receive(queue_name):
        try:
            click.echo('processing: %s' % message.body)
            (output_bucket, prefix, z, x, y) = json.loads(message.body)
            image = get_image_tile(image_tiles, x, y, z)

            # run prediction
            predicted = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
            make_prediction(net, colors, image, predicted)
            predicted.close()

            # upload raster prediction image
            key = '%s/%s/%s/%s.png' % (prefix, z, x, y)
            s3.upload_file(predicted.name, output_bucket, key, ExtraArgs={'ContentType': 'image/png'})

            # trace raster -> polygons
            polygonized = tempfile.NamedTemporaryFile(suffix='.geojson', delete=False)
            polygonized.write(json.dumps(vectorize(predicted.name)))
            polygonized.close()

            # upload polygon geojson for this tile
            key = '%s/%s/%s/%s.polygons.geojson' % (prefix, z, x, y)
            s3.upload_file(polygonized.name, output_bucket, key, ExtraArgs={'ContentType': 'application/json'})

            # polygons => centerlines
            polyspine_args = map(str, [polys_to_lines, polygonized.name, x, y, z, 0.2])
            exitcode = subprocess.call(polyspine_args, stdout=centerlines)

            # clean up tempfiles
            os.remove(predicted.name)
            os.remove(polygonized.name)

            if exitcode != 0:
                raise Exception('Vectorize exited nonzero')

            # upload centerlines geojson to S3 every so often
            count += 1
            if count % 5000 == 0:
                centerlines.close()
                upload_centerlines(centerlines.name, output_bucket, prefix)
                # clear the file out and continue writing
                centerlines = open(centerlines.name, 'w+b')

            # remove message from the queue
            message.delete()
        except TileNotFoundError:
            click.echo('Imagery tile not found.')
            message.delete()
        except Exception as err:
            click.echo(err)
            try:
                message.delete()
            except Exception:
                pass

    centerlines.close()
    upload_centerlines(centerlines.name, output_bucket, prefix)
    # 'french',
    # 'nazarov',
    'parupa',
    'samoan_no_vowels',
    'samoan'
]

for d in DATASETS:
    print('processing ' + d + ' dataset')
    data = 'corpora/' + d + '.txt'
    output = 'output_no_unique/' + d + '.txt'

    """ 1. Vector Embedding """
    print('\tvector embedding...')
    vecs, vocab = vectorize(data, unique=False)

    """ 2. Normalization """
    print('\tnormalization...')
    ppmi = vecs_to_ppmi(vecs)

    """ 3. PCA and Clustering """
    print('\tclustering...', output)
    cls = find_classes(ppmi, vocab, set([tuple(vocab.keys())]), max_k=2, max_pcs=1)

    with open(output, 'w') as out:
    pickle.dump([model, desc], outFile)
print('Model saved.')

# 2. Annotate the dev and test corpora
print('Labeling...')
for corpus_to_test in [dev_corpus, test_corpus]:
    corpus = open_json(corpus_to_test)
    if 'rstr' in method:
        _, matrix = get_matrix_rstr(method, corpus, desc_arg=desc)
        v = DictVectorizer()
        X = v.fit_transform(matrix)
        y_pred = model.predict(X)[:-1]
    else:
        vectors = [
            vectorize(infos, method, desc, test=True)[0]
            for instance_id, infos in corpus.items()
        ]
        if 'ngramChar' in method:
            for num_desc in desc.values():
                if num_desc not in vectors[0]:
                    vectors[0][num_desc] = 0
        dictvectorizer = DictVectorizer()  # ii. Transformation into a sparse matrix
        vectors = dictvectorizer.fit_transform(vectors)
        y_pred = model.predict(vectors)
    ids = [id for id in list(corpus.keys())]
    outFile_path = config['models_annots_dir'] + corpus_to_test.split('/')[-1].replace(
Date last modified: 9/8/16
Python Version: 2.7
'''

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, mean_squared_error, roc_auc_score, precision_score, recall_score
from math import sqrt
import pandas as pd
from extract import extract_dir
from vectorize import vectorize

print('\nExtracting Data...')
emails = vectorize(extract_dir("CSDMC2010_SPAM/TRAINING"), training=True)
labels = pd.read_table("CSDMC2010_SPAM/SPAMTrain.label",
                       delim_whitespace=True,
                       names=["labels", "files"])["labels"]  # SPAM = 0, HAM = 1

print('Splitting...')
X_train, X_test, y_train, y_test = train_test_split(emails, labels, test_size=0.33, random_state=42)

print('Training...')
model = LogisticRegression()
model.fit(X_train, y_train)

print('Saving Model...')
def classify(data):
    vect_input = np.asarray([vectorize(data, scale_size=512)]) / 256
    results = model.predict(vect_input)[0]
    return [float(x) for x in results]
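# Usage sketch for classify() above, assuming `model` is an already-loaded classifier
# and `vectorize` accepts an image path or file object (the path is hypothetical).
# Dividing by 256 rescales 8-bit pixel values into roughly the [0, 1) range before prediction.
# class_probabilities = classify('samples/example.png')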
    'french',
    'nazarov',
    'parupa',
    'samoan_no_vowels',
    'samoan'
]

for d in DATASETS:
    print('processing ' + d + ' dataset')
    data = 'corpora/' + d + '.txt'
    output = 'output/' + d + '.txt'

    """ 1. Vector Embedding """
    print('\tvector embedding...')
    vecs, vocab = vectorize(data, unique=True)

    """ 2. Normalization """
    print('\tnormalization...')
    ppmi = vecs_to_ppmi(vecs)

    """ 3. PCA and Clustering """
    print('\tclustering...', output)
    cls = find_classes(ppmi, vocab, set([tuple(vocab.keys())]), max_k=2, max_pcs=1)

    with open(output, 'w') as out:
import pandas as pd
import datetime

from database import insert_article_data, get_article_tokens, insert_vector_data
from get_content import get_items
from vectorize import vectorize

vec_path = './data/test.csv'

end = datetime.date.today()
start = end - datetime.timedelta(days=1)

df = get_items(start, end)
insert_article_data(df)

df = df[['article_id', 'tokens']]
df = vectorize(df)
insert_vector_data(df)
df.to_csv(vec_path, mode='a', index=False)
def dice(vector_1, vector_2):
    return 2 * soft_division(
        reduce(lambda p, c: p + min(c[0], c[1]), zip(vector_1, vector_2), 0),
        reduce(lambda p, c: p + c[0] + c[1], zip(vector_1, vector_2), 0),
    )


if __name__ == "__main__":
    from argparse import ArgumentParser

    from read import read
    from preprocess import preprocess
    from vectorize import vectorize, tfidf_vectorizer
    from db import words
    from pprint import pprint

    parser = ArgumentParser()
    parser.add_argument("-i", dest="path", type=str, help="path to text")
    parser.add_argument("-r", dest="request", type=str, help="request")
    args = parser.parse_args()

    model, vectorizer, db = vectorize(tfidf_vectorizer, preprocess(read(args.path)))
    pprint([
        calc_metrics(
            metric,
            list(preprocess([args.request]))[0],
            model,
            vectorizer,
            words(db),
        )
        for metric in (cossine, jaccard, dice)
    ])
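# A worked example of the soft Dice coefficient defined by dice() above (the numbers
# are illustrative, not from the source): for term-count vectors a = [1, 0, 2] and
# b = [1, 1, 1], the numerator reduce gives sum(min(a_i, b_i)) = 1 + 0 + 1 = 2 and the
# denominator reduce gives sum(a_i + b_i) = 2 + 1 + 3 = 6, so dice(a, b) = 2 * 2 / 6 ≈ 0.67.
# Identical non-zero vectors score 1.0 and disjoint vectors score 0.0, assuming
# soft_division is a zero-safe divide.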
def get_data(key):
    print('Got ' + key)
    obj = io.BytesIO()
    s3.download_fileobj('isitanime-data-clean', key, obj)
    return np.asarray(vectorize(obj, scale_size=512)) / 256
    masked_data = np.ma.masked_equal(data, data_band.GetNoDataValue(), copy=False)
    masked_data.fill_value = no_data_value
    masked_data = np.ma.fix_invalid(masked_data, copy=False)
    data = masked_data.data
    return data, transform


def define_array():
    a = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                  [0, 1, 1, 2, 2, 3, 3, 2, 1, 0],
                  [0, 1, 2, 2, 3, 4, 4, 4, 1, 0],
                  [0, 1, 2, 3, 4, 4, 5, 1, 0, 0],
                  [0, 1, 3, 4, 3, 4, 5, 1, 0, 0],
                  [0, 1, 3, 4, 3, 4, 5, 1, 0, 0],
                  [0, 1, 2, 3, 4, 4, 5, 1, 0, 0],
                  [0, 1, 2, 2, 3, 4, 4, 4, 1, 0],
                  [0, 1, 1, 2, 2, 3, 3, 2, 1, 0],
                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
    return a


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    d, t = read_image(sys.argv[1], no_data_value=0)
    a_transform = np.array([[t[1], t[2]], [t[4], t[5]], [t[0], t[3]]])
    vectorize.vectorize(d, a_transform, range(0, 96, 1), exact=True)
'''
File name: predict.py
Author: Austin Jacobs
Date created: 9/8/16
Date last modified: 9/8/16
Python Version: 2.7
'''

from sklearn.externals import joblib

from extract import extract_file, extract_dir
from vectorize import vectorize

scale = raw_input('(1) File (2) Directory : ')
if scale == '1':
    filename = raw_input('File Name: ')
    email = [extract_file(filename)]
elif scale == '2':
    direc = raw_input('Directory: ')
    email = extract_dir(direc)
else:
    print('Invalid')
    exit()

email = vectorize(email)

model = joblib.load('Models/Filter_Model.pkl')
pred = model.predict(email)
print ['SPAM' if x == 0 else 'HAM' for x in pred]
import numpy as np  # needed for np.load / np.nan below
import pandas as pd

from libsvm import convert_to_libsvm
import LSTM_vectorizer as vec

# Uncomment this if you're running it for the first time
dim = 50

# GloVe
# glove = lf.load_glove("../Data/glove.6B.50d.txt")
# np.save("../Data/glove50d", glove)
glove = np.load("../Data/glove50d.npy").item()

# Training set
traindata = pd.read_csv('../Data/train.csv')
traindata = traindata.replace(np.nan, '', regex=True)
vec.vectorize(dim, glove, traindata, is_train=True)  # Ordinary vectorizing

# Test set
testdata = pd.read_csv('../Data/test.csv')
testdata = testdata.replace(np.nan, '', regex=True)
vec.vectorize(dim, glove, testdata, is_train=False)  # Ordinary vectorizing

# # # Load np files if already vectorized
# # train_vector = np.load("../Data/train_vector.npy")
# # test_vector = np.load("../Data/test_vector.npy")
# convert_to_libsvm(train_vector, traindata, True)
# convert_to_libsvm(test_vector, testdata, False)
# lf.conv_to_csv("../Data/pred.txt")
parser = ArgumentParser()
parser.add_argument("-i1", dest="path1", type=str, help="path to text")
parser.add_argument("-i2", dest="path2", type=str, help="path to text")
parser.add_argument("-r", dest="request", type=str, help="request")
args = parser.parse_args()

first_class_texts = list(read(args.path1))
second_class_texts = list(read(args.path2))
count = len(first_class_texts) + len(second_class_texts)
classes = (
    range(0, len(first_class_texts)),
    range(len(first_class_texts), count),
)

preprocess.preprocessors.append(preprocess.rm_stop_words)
model, vectorizer, db = vectorize(
    tfidf_vectorizer,
    preprocess.preprocess(first_class_texts + second_class_texts),
)

res = list(
    rocchio(
        list(preprocess.preprocess([args.request]))[0],
        model,
        words(db),
        vectorizer,
        classes,
        euclide,
    )
)
res.sort()
pprint(res)
import numpy as np
from vectorize import vectorize
import pickle
import math
import random
from nltk.stem import PorterStemmer
import os
from annoy import AnnoyIndex  # needed for the approximate nearest-neighbour index built below

INIT = False
pklpath = 'glove.6B.100d.pkl'
stemmer = PorterStemmer()

if (INIT):
    index_to_word, word_to_index, vec = vectorize('glove.6B.50d.txt',
                                                  limit=20000,
                                                  lemma_only=True,
                                                  pkl=True,
                                                  pklpath=pklpath)
else:
    with open(pklpath, 'rb') as f:
        (index_to_word, word_to_index, vec) = pickle.load(f)

vec_short = vec[:5001]
print('embeddings loaded!')

a = AnnoyIndex(vec.shape[1], 'angular')
for i in range(vec.shape[0]):
    a.add_item(i, vec[i])
a.build(30)
import math  # needed for math.log below

import pandas as pd  # needed for pd.read_csv below

from vectorize import vectorize
from text_preprocess import query_preprocess
from scipy import spatial


def calculate_dcg(items):
    dcg = 0
    i = 0
    for item in items:
        i += 1
        dcg += item / math.log(i + 1, 2)
    return dcg


# load the tfidf vectorizer and the tfidf vector of docs
tfidf, tfidf_vectors, docs = vectorize('description_doc.csv')

# load the query list and true score of rankings
relevance = pd.read_csv('relevance.csv')
test_query = relevance.columns.values[1:]
true_score = []
for col in relevance:
    true_score.append(relevance[col].tolist())
true_score = true_score[1:]

# preprocess the query in query list
query_list = []
for query in test_query:
    query_list.append([query_preprocess(query)])

# convert the query to vector
        masked_data = np.ma.masked_values(data, data_band.GetNoDataValue(), copy=False)
    else:
        masked_data = np.ma.masked_equal(data, data_band.GetNoDataValue(), copy=False)
    masked_data.fill_value = no_data_value
    masked_data = np.ma.fix_invalid(masked_data, copy=False)
    data = masked_data.data
    return data, transform


def define_array():
    a = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                  [0, 1, 1, 2, 2, 3, 3, 2, 1, 0],
                  [0, 1, 2, 2, 3, 4, 4, 4, 1, 0],
                  [0, 1, 2, 3, 4, 4, 5, 1, 0, 0],
                  [0, 1, 3, 4, 3, 4, 5, 1, 0, 0],
                  [0, 1, 3, 4, 3, 4, 5, 1, 0, 0],
                  [0, 1, 2, 3, 4, 4, 5, 1, 0, 0],
                  [0, 1, 2, 2, 3, 4, 4, 4, 1, 0],
                  [0, 1, 1, 2, 2, 3, 3, 2, 1, 0],
                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
    return a


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    d, t = read_image(sys.argv[1], no_data_value=0)
    a_transform = np.array([[t[1], t[2]], [t[4], t[5]], [t[0], t[3]]])
    vectorize.vectorize(d, a_transform, range(0, 96, 1), exact=True)
test_ratio = 0.2

############################
# Get data
############################
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

X_train = newsgroups_train.data
y_train = [l.split('/')[-2] for l in newsgroups_train.filenames]
y_train = [['Top/' + '/'.join(label.split('.'))] for label in y_train]

X_test = newsgroups_test.data
y_test = [l.split('/')[-2] for l in newsgroups_test.filenames]
y_test = [['Top/' + '/'.join(label.split('.'))] for label in y_test]

vectorizer, X_train_matrix = vectorize(X_train)

#############################
# Build the tree, train it and evaluate prediction
#############################
tree = Tree(classifier_type, X_train_matrix, y_train, max_level)
tree.train()
predict_labels = tree.predict_text(X_test, vectorizer)
eval_res = evaluate(y_test, predict_labels, 6)

print('precision:')
print(eval_res[0])
print('recall:')
print(eval_res[1])
print('F1:')
print(eval_res[2])