def load_dataset():
    try:
        dataset["drivers"] = pd.read_csv("data/drivers.csv")
        dataset["races"] = pd.read_csv("data/races.csv", parse_dates=["date"])
        dataset["circuits"] = pd.read_csv("data/circuits.csv")
        dataset["constructors"] = pd.read_csv("data/constructors.csv")
    except OSError as err:
        raise Exception("Could not load data") from err
def __init__(self):
    # Read the dataset into a pandas dataframe.
    df1 = pd.read_csv('static/data/DB_2009-2010.csv')
    df2 = pd.read_csv('static/data/DB_2010-2011.csv')
    self.dataF = pd.concat([df1, df2])  # join the data so we handle a single database
    self.dataF = self.data()
    self.dataMonths = self.dataMonths()
    self.data_products = self.data_products()
    return
def quick_start():
    print('add some default albums and feature families')
    stock_albums()
    stock_families()
    data_csv = pd.read_csv("./csv/features_album_Lymphangitis_Texture-Intensity_CT_GTV_L.csv")
    load_file_to_db(data_csv, 'album_1', 'default_qib_1', 'CT_GTV_L')
    data_csv = pd.read_csv("./csv/features_album_Lymphangitis_Texture-Intensity_CT_GTV_N.csv")
    load_file_to_db(data_csv, 'album_1', 'default_qib_2', 'CT_GTV_N')
    data_csv = pd.read_csv("./csv/list_patients_outcome.csv")
    add_outcome(data_csv)
def run():
    df = pd.read_csv('./cleanfile.csv', encoding='utf-8', sep=',')
    df["sentiments"] = df["content"].map(lambda c: snownlp.SnowNLP(c).sentiments)
    df["keywords"] = df["content"].map(getKeyWord)
    # engine = create_engine('mysql+pymysql://root:@127.0.0.1:3306/sina')
    engine = create_engine(
        'mysql+mysqlconnector://root:[email protected]:3306/sina?charset=utf8&connect_timeout=10'
    )
    dtypedict = {
        'id': Integer(),
        'uid': VARCHAR(length=15),
        'area': VARCHAR(length=15),
        'ipadd': VARCHAR(length=15),
        'usertype': VARCHAR(length=10),
        'agree': VARCHAR(length=10),
        'cmttime': DATETIME(),
        'content': TEXT,
        'sentiments': DECIMAL(10, 10),
        'keywords': VARCHAR(length=100),
    }
    df.to_sql(name='news',
              con=engine,
              chunksize=100000,
              if_exists='replace',
              index=True,
              index_label='id',
              dtype=dtypedict)
def readFiles(encodingFile, energyFile, encodingColsToUse, energyColsToUse,
              spcToRemove, spcAsPivot=None, spcToAdd=None):
    rowsToRemove = []
    rowsAsPivot = []
    energyDF = pd.read_csv(energyFile, skiprows=0, index_col=0)
    for i in range(energyDF.shape[0]):
        if energyDF.loc[i, 'species'] in spcToRemove:
            rowsToRemove.append(i)
        if spcToAdd is not None and energyDF.loc[i, 'species'] not in spcToAdd:
            rowsToRemove.append(i)
        if spcAsPivot is not None and energyDF.loc[i, 'species'] in spcAsPivot:
            rowsAsPivot.append(i)
    encoding = np.loadtxt(encodingFile, delimiter=',', skiprows=1, usecols=encodingColsToUse)
    energy = np.loadtxt(energyFile, delimiter=',', skiprows=1, usecols=energyColsToUse)
    encodingPivot = None
    energyPivot = None
    if len(rowsAsPivot) > 0:
        encodingPivot = encoding[rowsAsPivot, :]
        energyPivot = energy[rowsAsPivot]
        rowsToRemove.extend(rowsAsPivot)
    encoding = np.delete(encoding, rowsToRemove, 0)
    energy = np.delete(energy, rowsToRemove, 0)
    return encoding, energy, encodingPivot, energyPivot
def run():
    df = pd.read_csv('./cleanfile.csv', encoding='utf-8', sep=',')
    df["sentiments"] = df["content"].map(lambda c: snownlp.SnowNLP(c).sentiments)
    df["keywords"] = df["content"].map(getKeyWord)
    df["input_time"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # engine = create_engine('mysql+pymysql://root:@127.0.0.1:3306/sina')
    engine = create_engine('mysql+mysqlconnector://root:@127.0.0.1:3306/sina')
    dtypedict = {
        'id': Integer(),
        'mid': VARCHAR(length=50),
        'content': TEXT,
        'uid': VARCHAR(length=15),
        'area': VARCHAR(length=15),
        'nick': VARCHAR(length=50),
        'ip': VARCHAR(length=15),
        'newsid': VARCHAR(length=50),
        'time': DATETIME(),
        'sentiments': DECIMAL(10, 10),
        'keywords': VARCHAR(length=100),
        'input_time': DATETIME(),
    }
    df.to_sql(name='news',
              con=engine,
              chunksize=100000,
              if_exists='replace',
              index=True,
              index_label='id',
              dtype=dtypedict)
def compile_data():
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)

    main_df = pd.DataFrame()

    for count, ticker in enumerate(tickers):
        df = pd.read_csv('stock_dfs/{}.csv'.format(ticker))
        df.reset_index(inplace=True)
        df.set_index('Date', inplace=True)
        df.rename(columns={'Adj Close': ticker}, inplace=True)
        df.drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1, inplace=True)

        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.merge(df, how='outer')
        gc.collect()

        if count % 10 == 0:
            print(count)

    print(main_df.head())
    main_df.to_csv('sp500_joined_closes.csv')
def init_population(filename, year=2017):
    filepath = basepath + filename
    df = pandas.read_csv(filepath)
    num_places = len(df)
    df = df.drop(df.columns[0], axis=1)
    # print(df)
    population = Population(num_places)
    # age_ranges = Parser.get_age_ranges()
    population_ages = Parser.get_population_data()[year]
    total_pop = sum(population_ages.values())
    for pop in population_ages:
        percentage = population_ages[pop] / total_pop
        population_ages[pop] = percentage
    for _, row in df.iterrows():
        lug_pop = row['POPULATION']
        home_loc = row['PLACE_ID']
        for (a, b) in population_ages:
            percentage = population_ages[(a, b)]
            pop_in_range = int(round(percentage * lug_pop, 0))
            population.add_batch_age_range(pop_in_range, home_loc, a, b)
    return population
def load_population_data(filename):
    filepath = basepath + filename
    df = pandas.read_csv(filepath)
    sections = {}
    for _, row in df.iterrows():
        section_id = row['SEC']
        fensino_1bas = row['N_IND_RESIDENT_FENSINO_1BAS']
        fensino_2bas = row['N_IND_RESIDENT_FENSINO_2BAS']
        fensino_3bas = row['N_IND_RESIDENT_FENSINO_3BAS']
        fensino_sec = row['N_IND_RESIDENT_FENSINO_SEC']
        fensino_possec = row['N_IND_RESIDENT_FENSINO_POSSEC']
        reform = row['N_IND_RESID_PENS_REFORM']
        desemp = row['N_IND_RESID_DESEMP']
        sections[section_id] = {
            'fensino_1bas': fensino_1bas,
            'fensino_2bas': fensino_2bas,
            'fensino_3bas': fensino_3bas,
            'fensino_sec': fensino_sec,
            'fensino_possec': fensino_possec,
            'reform': reform,
            'desemp': desemp
        }
        # print(sections[section_id])
        # print(section_id, fensino_1bas, fensino_2bas, fensino_3bas, fensino_sec, fensino_possec, reform, desemp, sep=' - ')
    return sections
def add_previous_columns(drop_columns, periodicity_key, archive_name, test_size,
                         historic_count, create_complete_csv):
    # read clean csv
    df = pandas.read_csv('../data/dataTemp.csv')
    # transform object to date
    df['cnv_date'] = pandas.to_datetime(df.cnv_date, infer_datetime_format=True)
    # order by date, descending
    df = df.sort_values(by='cnv_date', ascending=False)
    # add time variables for historical values
    df[time_variables_columns(periodicity_key)] = previous_data_historic(periodicity_key, df)
    # create new data
    data = util_create_scenery(df, historic_count, drop_columns)
    # split train and test data according to the test_size percentage
    train, test = train_test_split(data, test_size=test_size)
    # write two csv files, one for train and one for test
    test_path = '../data/results/' + archive_name + '_test' + '.csv'
    train_path = '../data/results/' + archive_name + '_train' + '.csv'
    train.to_csv(train_path, header=None, index=False)
    test.to_csv(test_path, header=None, index=False)
    # optionally write the complete data set as well
    if create_complete_csv:
        path = '../data/results/' + archive_name + '.csv'
        data.to_csv(path, header=None, index=False)
    print('*******Create Data Successfully*******')
    print('******* ' + archive_name + ' Done' + '*******')
def upload_csv():
    try:
        file_name = ''
        album_name = ''
        qib_name = ''
        qib_description = ''
        csv_type = ''
        if request.form:
            album_name = request.form['album_name']
            qib_name = request.form['qib_name']
            qib_description = request.form['qib_description']
            file_name = request.form['file_name']
            csv_type = request.form['csv_type']
        if request.method == 'POST' and request.files:
            data = pd.read_csv(request.files[file_name])
            headers = data.columns.values.tolist()
            csv_check = check_headers(headers, csv_type)
            if not csv_check:
                return jsonify('Invalid columns')
            else:
                if csv_type == 'Custom QIB':
                    load_custom_filter_csv_to_db(data, album_name, qib_name, qib_description)
                if csv_type == 'New QIB':
                    load_file_to_db(data, album_name, qib_name, qib_description)
                if csv_type == 'Outcome list':
                    add_outcome(data)
        return jsonify('OK')
    except Exception as err:
        return jsonify(f"{err.__class__.__name__}: {err}")
def main():
    print('loading the file')
    print('................')
    print('\n')
    dirname = os.path.dirname(__file__)
    filename = os.path.join(dirname, '../data/FormatoDeAlmacenamiento1.csv')
    df = pd.read_csv(filename, sep=';', header=None, na_values=" NaN")
    # print(df)
    hora = df[0].str.extract(r'((?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d)')
    # print(hora)
    # replace the NaN values with zero
    df2 = pd.DataFrame()
    df2 = df2.fillna(0)
    df2[0] = df[0].str.extract(r'((?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d)')
    df2[1] = df[1]
    # print(df2)
    print("grouping by minute")
    df2[0] = pd.DatetimeIndex(df2[0])
    df2.set_index(keys=0, inplace=True)
    ini = datetime.time(0, 18, 0)
    fin = datetime.time(23, 59, 0)
    df3 = df2[[1]].between_time(ini, fin)
    df3 = df3.groupby([1])[[1]].count()
    print(df3)
def prepare_dataset(dataset_path):
    '''
    Read a comma separated text file where
    - the first field is an ID number
    - the second field is a class label 'B' or 'M'
    - the remaining fields are real-valued

    Return two numpy arrays X and y where
    - X is two dimensional. X[i,:] is the ith example
    - y is one dimensional. y[i] is the class label of X[i,:]
      y[i] should be set to 1 for 'M', and 0 for 'B'

    @param dataset_path: full path of the dataset text file
    @return X,y
    '''
    # read the dataset
    dataset = pd.read_csv(dataset_path, header=None)
    X = dataset.drop(dataset.columns[1], axis=1)
    # standardize X values for better prediction
    sc = StandardScaler()
    X = sc.fit_transform(X)
    # cast to string to make sure the labels have the correct type
    y = dataset.iloc[:, 1].astype(str)
    # encode the label column
    y = encode(y)
    print(X.shape)
    print(y.shape)
    return X, y
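# Usage sketch (not from the original source): 'breast_cancer.csv' is a hypothetical
# file laid out as the docstring above describes (ID, 'B'/'M' label, then real-valued features).
if __name__ == '__main__':
    X, y = prepare_dataset('breast_cancer.csv')
    print(X[:3])  # first three standardized feature rows
    print(y[:3])  # their encoded labels (1 for 'M', 0 for 'B')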
def get_recommendations(filename, recommendations_dict):
    # open for reading with "universal" newline mode set
    doc = codecs.open(filename=filename, mode='rU', encoding='UTF-8')
    raw_data = pd.read_csv(doc, sep='\r')
    return parse_recommendations(raw_data, recommendations_dict)
def __init__(self, board_file='default.csv', number_of_players=10):
    # columns: class, name, position, monopoly, monopoly_size, price, build_cost,
    # rent, rent_house_1, rent_house_2, rent_house_3, rent_house_4, rent_hotel, default_income
    self.specs = pd.read_csv(board_file).to_dict('records')
    self.number_of_locations = len(self.specs)
    self.number_of_players = number_of_players
    self.token_emojis = [
        random_emoji() for i in range(0, number_of_players + 1)
    ]
def save_data():
    df = pd.read_csv('./douban_book/comment_25984204.txt')
    df["sentiments"] = df["content"].map(lambda c: snownlp.SnowNLP(c).sentiments)
    engine = create_engine('mysql+pymysql://root:@127.0.0.1:3306/douban')
    df.to_sql(name='book',
              con=engine,
              chunksize=1000,
              if_exists='replace',
              index=False)
def process_data_for_labels(ticker):
    hm_days = 7
    df = pd.read_csv('sp500_joined_closes.csv', index_col=0)
    tickers = df.columns.values.tolist()
    df.fillna(0, inplace=True)

    for i in range(1, hm_days + 1):
        # percentage change of the ticker i days into the future
        df['{}_{}d'.format(ticker, i)] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]

    df.fillna(0, inplace=True)
    return tickers, df
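# Usage sketch (not part of the original code): assumes 'sp500_joined_closes.csv'
# exists (for example, produced by compile_data above) and that 'AAPL' is one of its columns.
if __name__ == '__main__':
    tickers, df = process_data_for_labels('AAPL')
    print(df[['AAPL_1d', 'AAPL_7d']].head())  # 1-day and 7-day forward percentage changes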
def calculate_offset(self, sample_values="", nominal=0):
    if sample_values != "":
        d = pd.read_csv(StringIO(sample_values))
        self.raw_data = d
        d_u = d.loc[d['direction'] == 'up']
        d_d = d.loc[d['direction'] == 'down']
        mean_u = d_u.rotation.mean()
        mean_d = d_d.rotation.mean()
        self.offset = ((mean_d + mean_u) / 2) - nominal
        self.calibrated = True
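# Stand-alone illustration of the offset arithmetic above (not from the original source):
# the offset is the midpoint of the mean 'up' and mean 'down' rotations minus the nominal value.
from io import StringIO
import pandas as pd

sample = "direction,rotation\nup,10.2\nup,10.4\ndown,9.6\ndown,9.8\n"
d = pd.read_csv(StringIO(sample))
mean_u = d.loc[d['direction'] == 'up'].rotation.mean()    # about 10.3
mean_d = d.loc[d['direction'] == 'down'].rotation.mean()  # about 9.7
print(((mean_d + mean_u) / 2) - 10)  # offset for nominal=10: approximately 0.0 (up to float rounding)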
def load_sections(filename):
    filepath = basepath + filename
    df = pandas.read_csv(filepath)
    sections = {}
    for _, row in df.iterrows():
        section = Section(row['SEC11'], row['lat'], row['lon'])
        sections[row['SEC11']] = section
    return sections
def activities(self, curr_location):
    # Extracting the CSVs for indoor information
    arena_import = pd.read_csv('Arenas.csv', index_col=0)
    arenas = self.coordinates(arena_import, 2, 7, 6, curr_location)
    library_import = pd.read_csv('Libraries.csv', index_col=0)
    libraries = self.coordinates(library_import, 2, 9, 8, curr_location)
    museums_import = pd.read_csv('Museums_and_Galleries.csv', index_col=0)
    museums = self.coordinates(museums_import, 2, 4, 3, curr_location)
    community_import = pd.read_csv('Recreation_and_Community_Centres.csv', index_col=0)
    communities = self.coordinates(community_import, 2, 4, 3, curr_location)

    # Extracting the CSVs for outdoor information
    beaches_import = pd.read_csv('Beaches.csv', index_col=0)
    beaches = self.coordinates(beaches_import, 2, 5, 4, curr_location)
    campground_import = pd.read_csv('Campgrounds.csv', index_col=0)
    camps = self.coordinates(campground_import, 1, 5, 4, curr_location)
    waterfalls_import = pd.read_csv('City_Waterfalls.csv', index_col=0)
    falls = self.coordinates(waterfalls_import, 3, 14, 13, curr_location)
    pads_import = pd.read_csv('Spray_Pads.csv', index_col=0)
    pads = self.coordinates(pads_import, 2, 9, 8, curr_location)

    # Sorting the lists by distance and categorising the information
    indoor = [arenas, libraries, museums, communities]
    outdoor = [beaches, camps, falls, pads]
    for i in range(len(indoor)):
        indoor[i].sort(key=lambda item: item.get('Distance'))
    for i in range(len(outdoor)):
        outdoor[i].sort(key=lambda item: item.get('Distance'))

    # Weather information API
    resp = requests.get(
        "https://api.darksky.net/forecast/8b486e7acbd606454f4a0f8f95b56886/43.263444914224245,-79.91824930126315"
    )
    data = resp.json()

    # Gets information on the temperature, current precipitation and predicted precipitation
    temp = data['currently']['apparentTemperature']
    precip = data['currently']['precipIntensity']
    predict_precip = data['currently']['precipProbability']
    return self.output(temp, precip, predict_precip, indoor, outdoor)
def RunForFiltersAndRuns(baseDir, numOfFP, hiddlayers, testSetSize, filterRange,
                         runsForFilter, runsForEnsemble):
    savepath = os.path.join(
        baseDir, 'RunResults_FP' + str(numOfFP) + '_hidd' + str(hiddlayers) + '.csv')
    collist = ['FilterCount', 'RunNo', 'MAE'] + ['AE' + str(i) for i in range(testSetSize)]
    if not os.path.exists(savepath):
        df = pd.DataFrame(columns=collist)
        df.to_csv(savepath)
    for k in filterRange:
        print('Replication count', str(k + 1))
        for i in range(runsForFilter):
            print('RUN', str(i + 1))
            df = pd.read_csv(savepath, index_col=0)
            if len(df[(df['FilterCount'] == k + 1) & (df['RunNo'] == i + 1)]) > 0:
                print('Already computed. Continuing...')
                continue
            print('Computing...')
            pred = SubnetWrapper(maxC=4,
                                 maxO=4,
                                 maxH=0,
                                 num_of_fingerprints=numOfFP,
                                 replicationCount=k + 1,
                                 learn_atomtype_weights=True,
                                 learn_filter_contrib_weights=True,
                                 shareWeightsAcrossAtomTypes=True,
                                 activation=ActivationFunctions.tanh,
                                 regularizer=RegularizerWrapper(True, 0.001),
                                 hiddenLayer=hiddlayers,
                                 dropout=0.9,
                                 initializer=WeightInitializerWrapper(
                                     InitializerTypes.RandomNormal,
                                     rand_norm_stddev=0.0001),
                                 learningrate=0.001,
                                 runcnt=runsForEnsemble,
                                 tolerance=1e-3)
            mae, aes_for_this_run = pred.makeExtrapolatingPrdictions(
                testSetSize=testSetSize,
                splitIndices=[217, 247],
                max_epochs=10000,
                encoding1=encodings,
                energy1=energies,
                encoding2=enctest,
                energy2=engtest,
                batchsize=217,
                useBatchNorm=False,
                isVerbose=False)
            print('MAE for current run:', mae)
            df.loc[len(df)] = dict(
                zip(collist, [k + 1, i + 1, mae] + aes_for_this_run.tolist()))
            df.to_csv(savepath)
def read_data_set():
    data_frame = pandas.read_csv('../data/data.csv')
    # transform timestamp to datetime
    date_set = pandas.to_datetime(data_frame.date, unit='s')
    data_frame['cnv_date'] = date_set
    # create error columns (2% of the reading) for mq2, mq7 and mq135
    data_frame['mq2_error'] = data_frame.mq2 * 0.02
    data_frame['mq7_error'] = data_frame.mq7 * 0.02
    data_frame['mq135_error'] = data_frame.mq135 * 0.02
    return data_frame
def readData(filename):
    if filename == "":
        raise FileNotFoundError("File not found")
    try:
        dataset = pd.read_csv(filename)
        X = dataset.iloc[:, :-1]
        y = dataset.iloc[:, -1]
    except Exception:
        print("error")
        raise  # re-raise so the caller can handle it
    print("complete")
    return X, y
def get_meanCV(file):
    CSV_file = pandas.read_csv(file)
    expt_samples = len(CSV_file)
    DNAs = 7
    replicates = expt_samples // DNAs  # integer division so the value can be used with range()
    if "A13" in CSV_file["Well"].values:
        plate_map = Container(None, _CONTAINER_TYPES['384-pcr'])
    else:
        plate_map = Container(None, _CONTAINER_TYPES['96-pcr'])
    start = 0
    replicate_locs = []
    for i in range(0, DNAs - 1):
        loc = [plate_map.humanize(s) for s in range(start, start + replicates)]
        replicate_locs.append(loc)
        start += replicates
    DNA_Ct = []
    for h in replicate_locs:
        for x in h:
            Replicate_Ct_DNA = []
            data_source = open(file)
            replicate_locations = h
            for line in data_source:
                split_line = line.split(',')
                wellID = split_line[0]
                Ct = split_line[3]
                for w in replicate_locations:
                    if w == wellID:
                        try:
                            Replicate_Ct_DNA.append(float(Ct))
                        except ValueError:
                            Replicate_Ct_DNA.append(0.0)
            DNA_Ct.append(Replicate_Ct_DNA)
    percentageCV = []
    for n in DNA_Ct:
        try:
            percentageCV.append((stats.pstdev(n) / stats.mean(n)) * 100)
        except ZeroDivisionError:
            percentageCV.append(0.0)
    meanCV = stats.mean(percentageCV)
    mean_SD = []  # collect mean and standard deviation per replicate group
    for n in DNA_Ct:
        line = [stats.mean(n), stats.pstdev(n)]
        mean_SD.append(line)
    writer = csv.writer(open('./output/mean_SD.csv', 'w'))
    writer.writerows(mean_SD)
    return meanCV
def init_population_census_2011(custom_origin_index=None):
    print("Initializing population")
    filepath = basepath + "pombal-detailed.csv"
    df = pandas.read_csv(filepath)
    # total = df.iloc[[47]]
    df = df.drop(df.index[47])
    ratio = get_resize_ratio()
    zones = df['Localidade'].tolist()
    population = Population()
    population.set_zones(zones)

    for index, row in df.iterrows():
        new_num_pop = 0
        for rng in AGE_RANGES:
            age_num = int(round(row[rng] * ratio, 0))
            new_num_pop += age_num
            df.loc[index, rng] = age_num
        df.loc[index, 'Total'] = new_num_pop

    for _, row in df.iterrows():
        lugar = row['Localidade']
        if custom_origin_index is not None:
            lugar = zones[custom_origin_index]
        for i in range(len(AGE_RANGES) - 2):
            key = AGE_RANGES[i]
            (a, b) = AGE_RANGES_NUM[i]
            num = row[key]
            population.add_batch_age_range(num, lugar, a, b)
        adult_range_num = row[len(AGE_RANGES)]
        senior_range_num = row[len(AGE_RANGES) + 1]
        adult_distribution = get_adult_age_distribution(adult_range_num)
        senior_distribution = get_senior_age_distribution(senior_range_num)
        for (a, b) in adult_distribution:
            num_age_range = adult_distribution[(a, b)]
            population.add_batch_age_range(num_age_range, lugar, a, b)
        for (a, b) in senior_distribution:
            num_age_range = senior_distribution[(a, b)]
            population.add_batch_age_range(num_age_range, lugar, a, b)

    population.get_stats().add_age_distribution_stats(
        0, population.get_population_age_distribution())
    print("Population initialized - Total population: {}".format(
        population.get_population_size()))
    population.get_stats().print_population_age_stats()
    return population
def __init__(self):
    database1 = 'database.csv'
    database = pd.read_csv(database1)
    self.x = database[[u'Feature1', u'Feature2']]
    self.y1 = database.Target1
    self.clf1 = MLPClassifier(solver='lbfgs',
                              alpha=1e-5,
                              hidden_layer_sizes=(20, 20),
                              random_state=1)
    self.clf1.fit(self.x, self.y1)
    self.clf2 = MLPClassifier(solver='lbfgs',
                              alpha=1e-5,
                              hidden_layer_sizes=(20, 20),
                              random_state=1)
def read_schools(self):
    filepath = basepath + self.schools_list
    df = pandas.read_csv(filepath)
    for _, row in df.iterrows():
        lat = row['Lat']
        lon = row['Lon']
        name = row['School Name']
        school_type = row['School Type']
        zone = self.get_point_zone(lat, lon)
        if zone is not None:
            obj = {'name': name, 'zone': zone, 'type': school_type}
            self.schools[school_type].append(obj)
def read_workplaces(self):
    filepath = basepath + self.workplaces_list
    df = pandas.read_csv(filepath)
    for _, row in df.iterrows():
        lat = row['Lat']
        lon = row['Lon']
        name = row['Name']
        work_type = row['Work Type']
        size = row['Size']
        zone = self.get_point_zone(lat, lon)
        if zone is not None:
            obj = {'name': name, 'zone': zone, 'type': work_type, 'size': size}
            self.workplaces[work_type].append(obj)
def read_summaries(district):
    """ Read in summaries of inspection reports into a Pandas dataframe. """
    # Just use the columns we need
    cols = [
        "county", "licnum", "sitename", "streetaddy", "cityaddy", "zip",
        "inspnum", "insptype", "inspdispos", "inspdate", "totalvio",
        "highvio", "licid", "visitid"
    ]
    try:
        insp = pd.read_csv(
            district,
            usecols=[2, 4, 5, 6, 7, 8, 9, 12, 13, 14, 17, 18, 80, 81],
            names=cols,
            dtype=object,
            encoding="ISO-8859-1")
    except FileNotFoundError:
        msg = "Sorry, the csv file for " + district + " was not found."
        print(msg)
    else:
        # Clean up some of the data before storing it in the db
        insp.sitename = insp.sitename.str.title()
        insp.sitename = insp.sitename.str.replace('Mcdonald\'s', 'McDonald\'s')
        insp.sitename = insp.sitename.str.replace('Mcdonalds', 'McDonald\'s')
        insp.sitename = insp.sitename.str.replace('Bbq', 'BBQ')
        insp.sitename = insp.sitename.str.replace(r'\'S ', '\'s ')
        insp.streetaddy = insp.streetaddy.str.title()
        insp.streetaddy = insp.streetaddy.str.replace(' Sw ', ' SW ')
        insp.streetaddy = insp.streetaddy.str.replace(' Se ', ' SE ')
        insp.streetaddy = insp.streetaddy.str.replace(' Nw ', ' NW ')
        insp.streetaddy = insp.streetaddy.str.replace(' Ne ', ' NE ')
        insp.streetaddy = insp.streetaddy.str.replace(' Rd', ' Road')
        insp.streetaddy = insp.streetaddy.str.replace(' Sr ', ' State Road ')
        insp.streetaddy = insp.streetaddy.str.replace(' Ste ', ', Suite ')
        insp.streetaddy = insp.streetaddy.str.replace(r'(?<=[4-9])Th ', 'th ')
        insp.streetaddy = insp.streetaddy.str.replace(r'2Nd ', '2nd ')
        insp.streetaddy = insp.streetaddy.str.replace(r'3Rd ', '3rd ')
        insp.streetaddy = insp.streetaddy.str.replace(r' Us ', ' US ')
        insp.cityaddy = insp.cityaddy.str.title()
        insp = insp.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)
        # cast to int so it can be filtered against the df
        insp['visitid'] = insp['visitid'].apply(int)
        insp.inspdate = pd.to_datetime(insp.inspdate)
        insp.inspdate = insp.inspdate.dt.strftime('%Y, %m, %d')
        return insp
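# Usage sketch (not part of the original code): 'district1.csv' is a hypothetical path to
# one district's inspection-summary export with the expected column layout.
if __name__ == '__main__':
    summaries = read_summaries('district1.csv')
    if summaries is not None:  # read_summaries returns None when the file is missing
        print(summaries[['sitename', 'inspdate', 'totalvio']].head())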
def main():
    print('loading the file')
    print('................')
    print('\n')
    dirname = os.path.dirname(__file__)
    filename = os.path.join(dirname, '../data/FormatoDeAlmacenamiento1.csv')
    df = pd.read_csv(filename, sep=';', header=None, na_values=" NaN")
    print(df)
    # print the information of the loaded file
    print(df.info())
    print('\n' * 2)
    print(df.describe())
    df.groupby(1)[2].count().plot(kind='pie', legend='Reverse')
    # clean data
    # print('\n' * 2)
def pget(newFile):
    # read from csv
    df = pd.read_csv(newFile)
    # drop NULL values from the Answer column
    df.dropna(axis=0, subset=['Answer'], inplace=True)

    # function to check CamelCase
    def is_camel_case(s):
        if s != s.lower() and s != s.upper() and "_" not in s and sum(i.isupper() for i in s[1:-1]) == 2:
            return True
        return False

    # run is_camel_case on the 'Answer' column and store the result in 'Row'
    df['Row'] = df['Answer'].apply(is_camel_case)
    # drop rows where the check returned False
    df.drop(df.loc[df['Row'] == False].index, inplace=True)
    # save to a new csv file
    newFileName = os.path.splitext(newFile)[0]
    df.to_csv(newFileName + '_' + strftime('%Y-%m-%d') + ".csv", sep=',')
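# Illustration (not in the original file): the nested is_camel_case check accepts mixed-case
# strings with no underscore and exactly two uppercase letters strictly inside the word.
def _demo_is_camel_case(s):
    return s != s.lower() and s != s.upper() and "_" not in s and sum(i.isupper() for i in s[1:-1]) == 2

print(_demo_is_camel_case("getUserName"))  # True: two interior capitals (U, N)
print(_demo_is_camel_case("getName"))      # False: only one interior capital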
import pandas as pd

stEdges = pd.read_csv("cal.cedge.csv")
stNodes = pd.read_csv("cal.cnode.csv")

print "COLUMNS FOR EDGES: ", stEdges.columns
print "COLUMNS FOR NODES: ", stNodes.columns

startCoords = []
endCoords = []
nodes = stNodes.as_matrix()

for i, edge in stEdges.iterrows():
    # print edge
    # print edge['startID'], edge['endID']
    start = int(edge['startID'])
    end = int(edge['endID'])
    startCoords.append((float(nodes[start][2]), float(nodes[start][1])))
    endCoords.append((float(nodes[end][2]), float(nodes[end][1])))
    # print edge['NodeID']
    # which st['NodeID'] == startID

startCoords = pd.Series(startCoords, name='startCoords')
endCoords = pd.Series(endCoords, name='endCoords')
# print startCoords

df = pd.concat([stEdges['EdgeID'], startCoords, endCoords, stEdges['distance']], axis=1)
# print df
def from_csv(csv_string, column_types=None, stand_in_columns=None):
    df = pandas.read_csv(StringIO(csv_string), dtype=column_types)
    _add_stand_in_columns(df, stand_in_columns)
    return QFrame(df)
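# Minimal sketch of the underlying pandas call (QFrame itself is not shown here):
# read_csv accepts an in-memory buffer plus an optional per-column dtype mapping.
from io import StringIO
import pandas

df = pandas.read_csv(StringIO("id,score\n1,0.5\n2,0.8\n"), dtype={"id": "int64"})
print(df.dtypes)  # id -> int64, score -> float64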
import pandas as pd
import collections, math, random, sys, time, datetime
from copy import deepcopy

# weights of different types of crimes
CRIME_TYPE_WEIGHTS = {'ROBBERY': 5, 'SEX OFFENSES, FORCIBLE': 6, 'DRUG/NARCOTIC': 2,
                      'KIDNAPPING': 7, 'SEX OFFENSES, NON FORCIBLE': 3, 'ASSAULT': 9}

# number of regions to divide the city into for k-means clustering
NUM_REGIONS = 10

edges = pd.read_csv("trimmed_edges.csv")
# crimes = pd.read_csv("crimes_with_streets.csv")
crimes = pd.read_csv("mini_crimes_set.csv")
testCrimes = pd.read_csv("test_crime_data.csv")

# a dictionary from edgeIDs to CrimeStreet objects
streets = {}
# a dictionary from crime streets to a list of known crimes read in from testCrimes (crime type, time)
knownCrimes = {}

def getDistance(a, b):
    return (a[0] - b[0]) * (a[0] - b[0]) + (a[1] - b[1]) * (a[1] - b[1])

def kmeans(crimes, K, maxIters):
    '''
    crimes: list of crime location and closest street pairs, ((lat, long), closestEdge)
    K: number of desired clusters. Assume that 0 < K <= |examples|.
    maxIters: maximum number of iterations to run for (you should terminate early
        if the algorithm converges).
    Return: (length K list of cluster centroids, list of assignments,
        (i.e. if crimes[i] belongs to centers[j], then assignments[i] = j)
def distFromStreet(self, loc):
    slope = (self.end[0] - self.start[0]) / (self.end[1] - self.start[1])
    perp_slope = -1 / slope
    b = self.start[1] - slope * self.start[0]
    b2 = loc[1] - perp_slope * loc[0]
    dist_lat = (b2 + b) / (slope - perp_slope)
    dist_long = dist_lat * slope + b
    # print dist_lat, dist_long
    dist = math.sqrt((dist_lat - loc[0]) ** 2 + (dist_long - loc[1]) ** 2)
    return dist

street = CrimeStreet(1, (1.0, 1.0), (2.0, 2.0), math.sqrt(2))
print street.distFromStreet((2, 1))

df = pd.read_csv("crimes_sub2.csv")
print df.columns
edges = pd.read_csv("edgeLocs.csv")

# prune the edges outside of the bounds 37.5-38.5N, -122.729 & -121.888
trimmed_edges = []
fail_count = [0 for _ in range(4)]
for edge in edges.iterrows():
    e = edge[1]
    start = eval(e['startCoords'])
    end = eval(e['endCoords'])
    if start[1] > -121.888 or start[1] < -122.729:
        fail_count[0] += 1
        continue
    if start[0] > 38.5 or start[0] < 37.5:
import fileinput
import pandas as pd

filename = "trimmed_edges.csv"
edges = pd.read_csv(filename)
lines = tuple(open(filename, 'r'))
fout = open('sf_edges', 'w')
i = 0
line = lines[i]
fout.write(line)
for edge in edges.iterrows():
    i += 1
    e = edge[1]
    startCoords = eval(e['startCoords'])
    endCoords = eval(e['endCoords'])
    if startCoords[1] > -122.35 or startCoords[1] < -122.52:
        continue
    # changing these numbers to be more refined to SF
    if startCoords[0] > 37.835 or startCoords[0] < 37.7:
        continue
    if endCoords[1] > -122.35 or endCoords[1] < -122.52:
        continue
    if endCoords[0] > 37.835 or endCoords[0] < 37.7:
        continue
    line = lines[i]
    fout.write(line)
fout.close()
        return min([math.sqrt((self.end[0] - loc[0]) ** 2 + (self.end[1] - loc[1]) ** 2),
                    math.sqrt((self.start[0] - loc[0]) ** 2 + (self.start[1] - loc[1]) ** 2)])
        # print dist_lat, dist_long
        dist = math.sqrt((dist_lat - loc[0]) ** 2 + (dist_long - loc[1]) ** 2)
        return dist

# street = CrimeStreet(1, (1.0, 1.0), (2.0, 2.0), math.sqrt(2))
# print street.distFromStreet((2, 1))
# st1 = CrimeStreet(8889, (37.707062, -121.93736299999999), (37.707069, -121.928421), 0.008942)
# st2 = CrimeStreet(8834, (37.735077000000004, -122.400658), (37.727768, -122.40138999999999), 0.007346)
# print 'FOR STREET 8889'
# print st1.distFromStreet((37.7783276318163, -122.426642472038))
# print 'FOR STREET 8834'
# print st2.distFromStreet((37.7783276318163, -122.426642472038))

edges = pd.read_csv("trimmed_edges.csv")
print 'finished reading trimmed_edges.csv'
streets = {}
for edge in edges.iterrows():
    e = edge[1]
    curr = CrimeStreet(e['EdgeID'], eval(e['startCoords']), eval(e['endCoords']), float(e['distance']))
    streets[e['EdgeID']] = curr

crime_data = pd.read_csv("crimes_sub2.csv")
print 'finished reading crimes_sub2.csv'
# print "COLUMNS FOR NODES: ", crime_data.columns
print crime_data.axes
cats = crime_data['Category']
            metrics.v_measure_score(labels, modelAlgo.labels_),
            metrics.adjusted_rand_score(labels, modelAlgo.labels_)))


def EncodedColumnValues(documentDataFrame, columnToBeEncoded):
    """This function enumerates the string values of the column as numbers"""
    modifiedDataFrame = documentDataFrame.copy()
    encodingTargetList = modifiedDataFrame[columnToBeEncoded].unique()
    convert_to_int = {strName: float(n) for n, strName in enumerate(encodingTargetList)}
    modifiedDataFrame["Target" + columnToBeEncoded] = \
        modifiedDataFrame[columnToBeEncoded].replace(convert_to_int).astype(np.float64)
    return modifiedDataFrame


if __name__ == '__main__':
    totalTime = time()
    """Please change the below path to where you have stored the database on your machine"""
    documentDataFrame = ps.read_csv('/media/nandan/Store/Python Excersises/FinalProjectML/train.csv')
    trainDataFrameIndex, testDataFrameIndex = train_test_split(documentDataFrame.index, train_size=0.7)
    trainDataFrame = documentDataFrame.iloc[trainDataFrameIndex]
    testDataFrame = documentDataFrame.iloc[testDataFrameIndex]
    modifiedtrainDataFrame = EncodedColumnValues(trainDataFrame, "Category")
    modifiedtestDataFrame = EncodedColumnValues(testDataFrame, "Category")
    featuresList = []
    featuresTestList = []
    modifiedtrainDataFrame = EncodedColumnValues(modifiedtrainDataFrame, "DayOfWeek")
    modifiedtestDataFrame = EncodedColumnValues(modifiedtestDataFrame, "DayOfWeek")
    modifiedtrainDataFrame = EncodedColumnValues(modifiedtrainDataFrame, "PdDistrict")
    modifiedtestDataFrame = EncodedColumnValues(modifiedtestDataFrame, "PdDistrict")
    featuresList.append(modifiedtrainDataFrame.columns[11])
    featuresList.append(modifiedtrainDataFrame.columns[7])
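# Small illustration (not in the original file): EncodedColumnValues maps each distinct
# string in a column to a float code and stores it in a new "Target<column>" column, e.g.
#
#     toy = ps.DataFrame({"Category": ["THEFT", "ASSAULT", "THEFT"]})
#     EncodedColumnValues(toy, "Category")["TargetCategory"].tolist()  # [0.0, 1.0, 0.0]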
def main():
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    parser.add_argument("--input", help="conllu file", default="../data/en-ud-dev.conllu")
    parser.add_argument("--lang")
    parser.add_argument("--posrules", help="head POS rules file", default="../data/posrules.tsv")
    parser.add_argument("--output", help="target file", default="testout.conllu")
    parser.add_argument("--parsing_strategy", choices=["rules", "pagerank", "adjacent"], default="pagerank")
    parser.add_argument(
        "--steps",
        choices=["twotags", "complete", "neighbors", "verbs", "function", "content", "headrule"],
        nargs="+",
        default=[""],
    )
    parser.add_argument("--reverse", action="store_true", default=True)
    parser.add_argument("--rule_backoff", choices=["cycle", "left", "right"], default="left")
    parser.add_argument("--ablation", choices=["pagerank", "2stepdecoding"], default="pagerank")
    args = parser.parse_args()

    if sys.version_info < (3, 0):
        print("Sorry, requires Python 3.x.")  # suggestion: install anaconda python
        sys.exit(1)

    headrules = pd.read_csv(args.posrules, sep="\t")
    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)
    ref_treebank = cio.read_conll_u(args.input)
    modif_treebank = []
    posbigramcounter, wordcounter = count_pos_bigrams(orig_treebank)
    functionlist = [x for x, y in wordcounter.most_common(100)]
    print(functionlist)
    fill_out_left_and_right_attach(posbigramcounter)

    if args.parsing_strategy == "pagerank":
        for o, ref in zip(orig_treebank, ref_treebank):
            s = copy.copy(o)
            s.remove_edges_from(s.edges())
            # From here and until tree reconstruction there is no symbolic root node,
            # which makes our life a bit easier
            s.remove_node(0)
            if "twotags" in args.steps:
                s = map_to_two_tags(s, functionlist)
            if "complete" in args.steps:
                s = add_all_edges(s)
            if "neighbors" in args.steps:
                s = add_short_edges(s)
            if "verbs" in args.steps:
                s = add_verb_edges(s)
            if "function" in args.steps:
                s = manage_function_words(s)
            if "content" in args.steps:
                s = relate_content_words(s)
            if "headrule" in args.steps:
                s = add_head_rule_edges(s, headrules)
            tree_decoding_algorithm_content_and_function(s, headrules, args.reverse, args.ablation)
            modif_treebank.append(s)

        if args.reverse:
            r = ".rev"
        else:
            r = ".norev"

        outfile = Path(args.lang + "_" + args.output + "_" + "_".join(args.steps) + r + ".conllu")
        cio.write_conll(modif_treebank, outfile, conllformat="conllu",
                        print_fused_forms=False, print_comments=False)
        outfile = Path(args.lang + "_" + args.output)
        cio.write_conll(modif_treebank, outfile, conllformat="conllu",
                        print_fused_forms=False, print_comments=False)
    elif args.parsing_strategy == "adjacent":
        for s in orig_treebank:
            s.remove_edges_from(s.edges())
            s = attach_adjacent(s, args.rule_backoff)
            modif_treebank.append(s)
        outfile = Path(args.output + "." + args.rule_backoff)
        cio.write_conll(modif_treebank, outfile, conllformat="conllu",
                        print_fused_forms=False, print_comments=False)
    else:
        for s in orig_treebank:
            s = add_high_confidence_edges(s, posbigramcounter, args.rule_backoff)
            modif_treebank.append(s)
        for k in sorted(scorerdict.keys()):
            prec = sum([p for p, r in scorerdict[k]]) / len(scorerdict[k])
            reca = sum([r for p, r in scorerdict[k]]) / len(scorerdict[k])
            print("{0}, {1:.2f}, {2:.2f}".format(k, prec, reca))
        outfile = Path(args.output + ".rules")
        cio.write_conll(modif_treebank, outfile, conllformat="conllu",
                        print_fused_forms=False, print_comments=False)
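# Example invocation (a sketch; the script name convert.py is hypothetical, while the flags,
# choices and defaults come from the argparse definition above):
#
#     python convert.py --input ../data/en-ud-dev.conllu --lang en \
#         --parsing_strategy pagerank --steps twotags complete neighbors verbs function content headrule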
    elif 21 <= int(data) < 24:
        return 'F1'  # 21:00-24:00
    elif 0 <= int(data) < 4:
        return 'F2'  # 00:00-04:00
    else:
        return '-'

import pandas as pd
# from datetime import datetime as dt
# from random import random as rand
# from numpy.random import randn

print '1/8 - Loading data'
df = pd.read_csv('dataset.csv', header=None, parse_dates=[1],
                 names=['UserID', 'DateTime', 'AntennaID'],
                 infer_datetime_format=True)

temp = pd.DatetimeIndex(df['DateTime'])
df['Date'] = temp.date
df['Time'] = temp.time

"""
# Filtering User
df_user_1 = df.groupby(['UserID'], sort=False).agg({"Date": lambda x: x.nunique()})
df_user_2 = df_user_1[df_user_1['Date'] > 2]
df_user_3 = df_user_2['Date']
df_user_3.to_csv('output_df_user.csv')
df_user_list = pd.read_csv('output_df_user.csv', header=None,
    return True

BeginList = []
for fx in fn.frames():
    if isFirstBeginner(fx['name']):
        BeginList.append(fx['name'] + '\t' + str(NumLexU(fx['name'])) + '\t_')
# print(len(BeginList))
print('\n'.join(sorted(BeginList)))

D = {}
D['WHO'] = ['People']
D['WHAT'] = ['Event', 'Eventive_affecting']
D['WHERE'] = ['Locale']
D['WHY'] = ['Event', 'Eventive_affecting']
D['HOW'] = ['']

frames = pd.read_csv("../res/frametargetlexicon.tsv", sep="\t")
framenames = set(frames.framename)

# Find FN first beginners and see which ones yield a perfect match
# Find the second-level children of these and repeat