def classify_vote(self, prompt, legislator):
    # retrieve the ID of the legislator
    legislators = pd.read_csv('./house_table.csv')
    leg_id = legislators[legislators['name'] == legislator].iloc[0]['id']
    # if the legislator doesn't exist, TODO: Do something
    # if legislators.empty:

    # get the set of bills the legislator voted on
    vote_history = pd.read_csv('./data/vote_table.csv')
    bill_df = vote_history.loc[vote_history['person'] == leg_id]

    # load the embeddings of all the bills the legislator voted on
    yeas = set()
    nays = set()
    for index, row in bill_df.iterrows():
        # add each bill's embedding to the matching set
        tensor = torch.load('data/tensors/' + row['bill'])
        if row['vote'] == 1:
            yeas.add(tensor)
        else:
            nays.add(tensor)

    # use the embedder to turn the prompt into an embedding
    prompt_embed = self.embedder.get_embeddings(prompt, willSave=False)
def load_data(path):
    if path is None:
        path = opt.data_path
    if not os.path.isfile(path):
        data = get_info("all")
        data.to_csv(path)
    return pd.read_csv(path)
def __init__(self, file_path, **kwargs):
    self.file_path = file_path
    self.kwarguments = {'sep': '\t', 'encoding': 'cp1252'}
    self.kwarguments.update(kwargs)
    self.df = pd.read_csv(file_path, **self.kwarguments)
def get_stop_words(self):
    print(' -> Getting stop word list...')
    file = 'stopwords_list.csv'
    stop_words_list = []
    if os.path.isfile(self.data_path + file):
        print(' -> Stop Words File is found')
        # dm = DataManager()
        df = pd.read_csv(self.data_path + file, encoding='utf-8')
        stop_words_list = df['Stopwords'].tolist()
    else:
        print(' -> Stop Words File is not found')
    return stop_words_list
def pre_prosseccing(self):
    # dm = DataManager()
    data = pd.read_csv(self.data_path + self.data_file_name + '.csv', encoding='utf-8')
    data = self.get_requirements_from_document(data)
    # description_reset = data.dropna(axis=0).reset_index(drop=True)
    description = data[self.factor]
    description_reset = description.dropna(axis=0).reset_index(drop=True)
    description = [sent.replace('\n', ' ') for sent in description_reset]
    with open(self.data_path + self.data_file_name + '_tm.documents', 'wb') as f:
        pickle.dump(description, f)

    # # get posting_id from the revised job_title
    # posting_ids = data['posting_id']
    # posting_list = posting_ids.to_list()
    #
    # # build the description data set according to posting_id
    # des_data = [data['job_description'][id] for id in posting_ids]
    # title_data = [data['job_title'][id] for id in posting_ids]
    # id_list = [i for i in range(len(posting_list))]
    # df = pd.DataFrame({'id': posting_list, 'job_title': title_data, 'job_description': des_data, 'posting_id': posting_list})
    # df.to_csv('data/doc2vec_test_data/0702/merge_0629_adj.csv', mode='w', encoding='utf-8')

    # load the revised description set and run data preprocessing
    # data = dm.load_csv(file='data/doc2vec_test_data/0702/merge_0629_adj.csv', encoding='utf-8')
    sentences = self.data_text_cleansing(data)
    data_words = list(self.sent_to_words(sentences))
    data_words_nostops = self.remove_stopwords(data_words)
    bigram = self.make_ngram(data_words_nostops, n=2)
    data_lemmatized = self.lematization(bigram)
    # bigram = self.make_bigram(data_words_nostops)
    # data_lemmatized = self.lematization(bigram)
    # for i in range(len(bigram)):
    #     print(f'[{i}] : {bigram[i]}')
    data_lemmatized_filter = self.word_filtering(data_lemmatized)
    # data_lemmatized_filter = data_lemmatized
    for i in range(len(data_lemmatized_filter)):
        print(f'[{i}] : {data_lemmatized_filter[i]}')
    # uniquewords = self.make_unique_words(data_lemmatized)
    with open(self.data_path + self.data_file_name + '.corpus', 'wb') as f:
        pickle.dump(data_lemmatized_filter, f)
    self.get_word_count(data_lemmatized_filter)
    print('=== end preprocessing ===')
    return data['id'], data_lemmatized_filter
def load_and_plot_data(filename):
    """Load a data frame and plot each column.

    Args:
        filename (str): Path to a CSV file of data.

    Returns:
        pandas.DataFrame
    """
    df = pd.read_csv(filename, index_col=0)
    df.hist()
    return df
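# Usage sketch for load_and_plot_data. The 'measurements.csv' path and the explicit
# matplotlib import are illustrative assumptions, not part of the original snippet.
import matplotlib.pyplot as plt

df = load_and_plot_data('measurements.csv')
plt.show()            # render the histograms drawn by df.hist()
print(df.describe())  # quick numeric summary of the loaded columns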
def get_including_words(self):
    print(' -> Getting including word list...')
    file = 'including_words_list.csv'
    including_words_list = []
    if os.path.isfile(self.data_path + file):
        print(' -> Including Words File is found')
        # dm = DataManager()
        df = pd.read_csv(self.data_path + file, encoding='utf-8')
        including_words_list = df['Includingwords'].tolist()
    else:
        print(' -> Including Words File is not found')
    print(including_words_list)
    return including_words_list
def main():
    # load iris data
    iris_df = pd.read_csv('../datasets/iris.csv', header=None).drop(0, axis=1)
    iris_train_X = iris_df.iloc[0:100, [0, 2]].values
    iris_train_target = iris_df.iloc[0:100, 4].values
    iris_train_y = np.where(iris_train_target == 'setosa', -1, 1)
    iris_test_X = iris_df.iloc[50:, [0, 2]].values
    iris_test_target = iris_df.iloc[50:, 4].values
    iris_test_y = np.where(iris_test_target == 'setosa', -1, 1)

    pnn = Perceptron(0.01, 10)
    print("Training perceptron with examples... ")
    print("Examples shape " + str(iris_train_X.shape) + " " + str(iris_train_y.shape) + "\n")
    pnn.fit(iris_train_X, iris_train_y)
def extractData(csv_path, json_path=''):
    # extract data from csv file
    df = pd.read_csv(csv_path)
    row, column = df.shape
    dict4json = dict()

    # # Open this part to get information for info.py
    # # original data
    # sex = df.sex
    # age = df.age_approx
    # location = df.anatom_site_general_challenge
    # label = df.target
    # # convert to our form
    # sex = set(sex)
    # age = age.unique().astype(np.int32)
    # location = set(location)
    # label = set(label)

    sex = list(base.sex())
    age = list(base.age())
    location = list(base.location())

    for idx in range(row):
        line = df.iloc[idx]
        name = line.image_name
        # map each categorical value to its integer position in the reference lists
        if line.sex not in sex:
            line.sex = 'Not sure'
        this_sex = sex.index(line.sex)
        this_age = age.index(int(line.age_approx))
        if line.anatom_site_general_challenge not in location:
            line.anatom_site_general_challenge = 'empty'
        this_location = location.index(line.anatom_site_general_challenge)
        # the label column in the source csv is `target`
        label = int(line.target)
        dict4json[name] = {
            'sex': this_sex,
            'age': this_age,
            'location': this_location,
            'label': label
        }

    # # Open this part if need to dump the label to json file
    if json_path:
        import json
        with open(json_path, 'w') as f:
            json.dump(dict4json, f)

    return dict4json
def get_synthetic_data(data_path, batch_size=64):
    """ Synthetic distribution data

    Args:
        data_path (str): path to the CSV of synthetic data
        batch_size (int): batch size for both loaders

    Returns:
        (torch.utils.data.DataLoader): train loader
        (torch.utils.data.DataLoader): test loader
    """
    training = pd.read_csv(data_path).values
    # pick your indices for sample 1 and sample 2:
    s1 = np.random.choice(range(training.shape[0]), int(0.9 * training.shape[0]), replace=False)
    s2 = list(set(range(training.shape[0])) - set(s1))
    # extract your samples:
    train_data = torch.as_tensor(training[s1, :-1])
    y_train_data = torch.as_tensor(training[s1, -1])
    test_data = torch.as_tensor(training[s2, :-1])
    y_test_data = torch.as_tensor(training[s2, -1])
    # one hot
    n = 11
    y_train_data_one_hot = torch.nn.functional.one_hot(
        y_train_data.to(torch.int64), n)
    y_test_data_one_hot = torch.nn.functional.one_hot(
        y_test_data.to(torch.int64), n)
    # create dataset and dataloaders
    train_dataset = torch.utils.data.TensorDataset(train_data, y_train_data_one_hot)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
    test_dataset = torch.utils.data.TensorDataset(test_data, y_test_data_one_hot)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)
    return train_loader, test_loader
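# Usage sketch for get_synthetic_data. The 'synthetic.csv' path is a hypothetical
# placeholder, and it assumes the CSV's last column holds integer class labels in 0..10
# (required by the n = 11 one-hot encoding above).
train_loader, test_loader = get_synthetic_data('synthetic.csv', batch_size=64)
xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)  # feature batch and one-hot label batch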
def get_including_words(self):
    print(' -> Getting including word list...')
    # dm = DataManager()
    # tfidf_file = self.data_file_name + '_tf_idf.csv'
    # tf_idf_df = dm.load_csv(file=self.data_path + tfidf_file, encoding='utf-8')
    # tf_idf_sum = tf_idf_df.iloc[-1:]
    # tf_idf_sum = tf_idf_sum.transpose()
    # print(tf_idf_sum)
    # print(tf_idf_sum.iloc[1])
    file = self.data_file_name + '_includings_list.csv'
    print(file)
    including_words_list = []
    if os.path.isfile(self.data_path + file):
        print(' -> Including Words File is found')
        df = pd.read_csv(self.data_path + file, encoding='utf-8')
        including_words_list = df['Includingwords'].tolist()
    else:
        print(' -> Including Words File is not found')
    print(including_words_list)
    return including_words_list
def main():
    # load_original_movies_df is a project helper, not a pandas function
    org_df = load_original_movies_df()
    df = pd.read_csv(DATA_DIR + 'ranking.csv')
    movie_features = org_df.columns
    reviewer_features = [column for column in df.columns if column not in movie_features]
    # for model training I want to find the most frequent reviewer and then try and
    # predict his movie likes/dislikes. Then using the parameters I will have a model
    # for a new reviewer
    # df[df.isnull().sum() < 500].columns

    # get the most frequent reviewer (the column with the most non-null ratings)
    reviewer = None
    most_reviews = 0
    for column in reviewer_features:
        if df[column].count() > most_reviews:
            most_reviews = df[column].count()
            reviewer = column
    # drop all movies that they didn't review
    df.dropna(axis=0, subset=[reviewer], inplace=True)
def evaluate_file(filename):
    df = pd.read_csv(filename)
    evaluate(df)
import numpy as np
import pandas as pd
import var
import collections
import math
import time
import os
import sys
from sklearn import preprocessing

df = pd.read_csv('Train.csv')

######################### To Do ####################
# Scale Data
# Split data into different categories of features, 2 pandas
# Identify all features into different cases


def discreteFeatures(dictionary, df, title):
    # add a 0/1 indicator column for each discrete value of the `title` column
    for key, val in dictionary.items():
        col = title + val
        df.loc[:, col] = np.where(df.loc[:, title] == key, 1, 0)
    return df


def scaleFeatures(df):
    # fit a standard scaler on the frame and return it for later transforms
    scaler = preprocessing.StandardScaler()
    scaler.fit(df)
    return scaler
import pandas as pd
import csv
import numpy as np

data = pd.read_csv('feeds.csv')

# for cleaning NaN values / 0 values
university_towns = []
with open('Datasets/university_towns.txt') as file:
    for line in file:
        if 'NaN' in line:
            # Remember this `state` until the next is found
            state = line
        else:
            # Otherwise, we have a city; keep `state` as last-seen
            university_towns.append((state, line))

# replacing NaN
from numpy import NaN
data = data.replace({NaN: 0.00})
# coding: utf-8
import pandas as pd

ces = pd.read_csv('results/meds-ces.csv')
extant_ciel = pd.read_csv('input/ciel-in-concepts-dict.csv')
extant_ciel.columns.values
list(extant_ciel["voided"])

# keep only non-voided concepts from the dictionary
extant_ciel = extant_ciel[extant_ciel["voided"] == 0]

# keep only rows that have a concept and are mapped to CIEL
ces = ces[~ces["concept"].isna()]
ces = ces[ces.concept.str.startswith("CIEL")]

# CIEL codes used in meds-ces that are not already in the concept dictionary
ces_not_in_dict = ces[~ces.concept.str.replace('CIEL:', '').isin(extant_ciel["ID"])]
import numpy as np
import pandas as pd

search_radius = 100
data = pd.read_csv('sh2.MP4.csv')
num_frames = np.max(data['Frame'].values) + 1

track_counter = 0
current_tracks = {}
completed_tracks = {}
# seed a track for every detection in the first frame
for k, row in data[data['Frame'] == 0].iterrows():
    key = 'track{:05d}'.format(track_counter)
    track_counter += 1
    current_tracks[key] = [
        (0, (row['Column'], row['Row'])),
    ]

for time_step in range(1, num_frames):
    num_active = len(current_tracks.keys())
    now = np.zeros((num_active, 2))
    prior = np.zeros((num_active, 2))
    key_list = list(current_tracks.keys())
    for index, key in enumerate(key_list):
        # current position, and the previous one if the track has history
        now[index, :] = current_tracks[key][-1][1]
        if len(current_tracks[key]) > 1:
            prior[index, :] = current_tracks[key][-2][1]
        else:
            prior[index, :] = now[index, :]
# Calculate the rolling 21-day volatility of any Darwin
import pandas as pd

df = pd.read_csv('LSV___XXX___.csv', index_col=0)
df.index = pd.to_datetime(df.index, unit='ms')
df.resample('D').last()['quote'].rolling(21).std().dropna()
def divvy_crime_data(grid, data_fname=DATA_SAMPLE_FILENAME):
    # load the crime data sample; TODO: divvy it up across `grid`
    crime_df = pd.read_csv(data_fname)
    return crime_df
# coding: UTF-8
import pandas as pd
import numpy as np

files = input().split()
for file in files:
    pd.read_csv(file)
test_accuracy[i] = knn.score(X_test, y_test)

# Generate plot
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()

# REGRESSION WITH BOSTON HOUSE DATASET
# first import and load the dataset
boston = pd.read_csv('boston.csv')
# display the first few rows
print(boston.head())
# now create features and target
X = boston.drop('MEDV', axis=1).values
y = boston['MEDV'].values
# predict house price from a single feature, column no. 5
X_rooms = X[:, 5]
# you can check the type of the data
type(X_rooms), type(y)  # numpy arrays
# reshape
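# A hedged continuation of the `# reshape` step hinted at above. The LinearRegression
# fit is an illustrative assumption and not part of the original snippet.
from sklearn.linear_model import LinearRegression

X_rooms = X_rooms.reshape(-1, 1)  # scikit-learn expects a 2-D feature array
y = y.reshape(-1, 1)
reg = LinearRegression()
reg.fit(X_rooms, y)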
1. whole_value (which is the array of the whole word embedding) so I can convert it to a matrix
2. Change GRU's loading direction adapted to persian
"""
import tensorflow as tf
import pandas as pd
import numpy as np

# ─── 0 INPUT PRE-PROCESSING ─────────────────────────────────────────────────────────
"""word embedding algorithm"""
# i will be using twitter hashtags

# __ VARIABLE __
E = []  # vector E, consists of e, which are computed one-hots times L, L is the sentence?

# load data
data = pd.read_csv("twitter_hashtag.csv")

# load each row of sentence
for _, row in data.iterrows():
    # parse the tweet, have the number of words in it
    # shape the sentence, word embeddings?!
    # compute one-hot vector for each word in the sentence
    for word in data['words']:  # words column consists of the sentence word in a list!
        data['one-hot'] = word.compute_one_hot()
        E.append(data['one-hot'] * L)  # what's L? how to multiply it?

# processing the aspect and taking into account Va (embedding of aspect's vector)
if len(data['aspect']) == 1:
    # take its e, as Va
    V_a = E[data['aspect_loc']]
else:
    # if aspect is more than one word
def data_from_csv(self, filepath):
    """ load the dataframe using pandas lib """
    # pandas has no `separator` keyword; rely on the default comma delimiter
    self.dataframe = pd.read_csv(filepath)
def load_csv(self, fpath, *args, **kwargs):
    df = pd.read_csv(fpath, *args, **kwargs)
    return df
def read_elo_file(self):
    return pd.read_csv(self.file_path + self.file_name)
def _load_data_and_answers(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
    return pd.read_csv(self.TEST_FILE_PATH), pd.read_csv(
        self.TEST_ANSWERS_FILE_PATH)
"headers, signatures, and quoting.") (opts, args) = op.parse_args() if len(args) > 0: op.error("this script takes no arguments.") sys.exit(1) print(__doc__) op.print_help() print() ############################################################################### # Load some categories from the training set data = pd.load_csv('./data/Train_rev1.csv') if opts.all_categories: categories = None else: categories = [ 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', ] if opts.filtered: remove = ('headers', 'footers', 'quotes') else: remove = ()
#!/usr/bin/env python3
import pandas
from sklearn import datasets

if __name__ == '__main__':
    dataframe = pandas.read_csv('iris.csv')
def load_by_pandas(filepath, **kwargs):
    sha1 = digests.sha1(filepath)
    identifier = "sha1:" + sha1
    import pandas
    frame = pandas.read_csv(filepath, **kwargs)
    return CommaSeparatedValue(identifier, frame)