def load_dataset():
    # 'dataset' is assumed to be a module-level dict shared by the callers
    try:
        dataset["drivers"] = pd.read_csv("data/drivers.csv")
        dataset["races"] = pd.read_csv("data/races.csv", parse_dates=["date"])
        dataset["circuits"] = pd.read_csv("data/circuits.csv")
        dataset["constructors"] = pd.read_csv("data/constructors.csv")
    except (OSError, pd.errors.ParserError) as err:
        raise RuntimeError("Could not load data") from err
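A minimal usage sketch for the loader above, assuming `dataset` is the module-level dict the function fills and that the CSV files exist under data/:

import pandas as pd

dataset = {}

load_dataset()
print(dataset["races"].dtypes)      # 'date' parses as datetime64[ns]
print(dataset["drivers"].head())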
Example #2
    def __init__(self):
        # Read the data sets into pandas dataframes.
        df1 = pd.read_csv('static/data/DB_2009-2010.csv')
        df2 = pd.read_csv('static/data/DB_2010-2011.csv')
        # Concatenate the two years so we work with a single dataframe.
        self.dataF = pd.concat([df1, df2])
        self.dataF = self.data()
        self.dataMonths = self.dataMonths()
        self.data_products = self.data_products()
Example #3
def quick_start():
    print('add some default albums and feature families')
    stock_albums()
    stock_families()
    data_csv = pd.read_csv("./csv/features_album_Lymphangitis_Texture-Intensity_CT_GTV_L.csv")
    load_file_to_db(data_csv, 'album_1', 'default_qib_1', 'CT_GTV_L')
    data_csv = pd.read_csv("./csv/features_album_Lymphangitis_Texture-Intensity_CT_GTV_N.csv")
    load_file_to_db(data_csv, 'album_1', 'default_qib_2', 'CT_GTV_N')
    data_csv = pd.read_csv("./csv/list_patients_outcome.csv")
    add_outcome(data_csv)
def run():
    df = pd.read_csv('./cleanfile.csv', encoding='utf-8', sep=',')
    df["sentiments"] = df["content"].map(
        lambda c: snownlp.SnowNLP(c).sentiments)
    df["keywords"] = df["content"].map(getKeyWord)

    #engine = create_engine('mysql+pymysql://root:@127.0.0.1:3306/sina')
    engine = create_engine(
        'mysql+mysqlconnector://root:@127.0.0.1:3306/sina?charset=utf8&connect_timeout=10'
    )

    dtypedict = {
        'id': Integer(),
        'uid': VARCHAR(length=15),
        'area': VARCHAR(length=15),
        'ipadd': VARCHAR(length=15),
        'usertype': VARCHAR(length=10),
        'agree': VARCHAR(length=10),
        'cmttime': DATETIME(),
        'content': TEXT,
        'sentiments': DECIMAL(10, 10),
        'keywords': VARCHAR(length=100),
    }
    df.to_sql(name='news',
              con=engine,
              chunksize=100000,
              if_exists='replace',
              index=True,
              index_label='id',
              dtype=dtypedict)
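A quick sanity check one might run right after the upload above; it reuses the same `engine` and only assumes the `news` table was just written:

print(pd.read_sql('SELECT COUNT(*) AS n FROM news', con=engine))
sample = pd.read_sql('SELECT content, sentiments, keywords FROM news LIMIT 5', con=engine)
print(sample)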
def readFiles(encodingFile,
              energyFile,
              encodingColsToUse,
              energyColsToUse,
              spcToRemove,
              spcAsPivot=None,
              spcToAdd=None):
    rowsToRemove = []
    rowsAsPivot = []
    energyDF = pd.read_csv(energyFile, skiprows=0, index_col=0)
    for i in range(energyDF.shape[0]):
        if energyDF.loc[i, 'species'] in spcToRemove:
            rowsToRemove.append(i)
        if spcToAdd is not None and energyDF.loc[i, 'species'] not in spcToAdd:
            rowsToRemove.append(i)
        if spcAsPivot is not None and energyDF.loc[i, 'species'] in spcAsPivot:
            rowsAsPivot.append(i)
    encoding = np.loadtxt(encodingFile,
                          delimiter=',',
                          skiprows=1,
                          usecols=encodingColsToUse)
    energy = np.loadtxt(energyFile,
                        delimiter=',',
                        skiprows=1,
                        usecols=energyColsToUse)
    encodingPivot = None
    energyPivot = None
    if len(rowsAsPivot) > 0:
        encodingPivot = encoding[rowsAsPivot, :]
        energyPivot = energy[rowsAsPivot]
    rowsToRemove.extend(rowsAsPivot)
    encoding = np.delete(encoding, rowsToRemove, 0)
    energy = np.delete(energy, rowsToRemove, 0)
    return encoding, energy, encodingPivot, energyPivot
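A hedged call sketch for readFiles; the file names, column indices, and species labels below are placeholders rather than values from the original project:

encoding, energy, encodingPivot, energyPivot = readFiles(
    encodingFile='encodings.csv',      # hypothetical CSV of per-species descriptors
    energyFile='energies.csv',         # hypothetical CSV with a 'species' column
    encodingColsToUse=range(1, 11),    # placeholder column indices
    energyColsToUse=(1,),
    spcToRemove=['H2O'],               # placeholder species labels
    spcAsPivot=['CH4'])
print(encoding.shape, energy.shape)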
Example #6
def run():
    df = pd.read_csv('./cleanfile.csv', encoding='utf-8', sep=',')
    df["sentiments"] = df["content"].map(
        lambda c: snownlp.SnowNLP(c).sentiments)
    df["keywords"] = df["content"].map(getKeyWord)
    df["input_time"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    #engine = create_engine('mysql+pymysql://root:@127.0.0.1:3306/sina')
    engine = create_engine('mysql+mysqlconnector://root:@127.0.0.1:3306/sina')

    dtypedict = {
        'id': Integer(),
        'mid': VARCHAR(length=50),
        'content': TEXT,
        'uid': VARCHAR(length=15),
        'area': VARCHAR(length=15),
        'nick': VARCHAR(length=50),
        'ip': VARCHAR(length=15),
        'newsid': VARCHAR(length=50),
        'time': DATETIME(),
        'sentiments': DECIMAL(10, 10),
        'keywords': VARCHAR(length=100),
        'input_time': DATETIME(),
    }
    df.to_sql(name='news',
              con=engine,
              chunksize=100000,
              if_exists='replace',
              index=True,
              index_label='id',
              dtype=dtypedict)
Example #7
def compile_data():
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)

    main_df = pd.DataFrame()

    for count, ticker in enumerate(tickers):
        df = pd.read_csv('stock_dfs/{}.csv'.format(ticker))
        df.reset_index(inplace=True)
        df.set_index('Date', inplace=True)

        df.rename(columns={'Adj Close': ticker}, inplace=True)
        df.drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1, inplace=True)

        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df, how='outer')

        gc.collect()

        if count % 10 == 0:
            print(count)

    print(main_df.head())
    main_df.to_csv('sp500_joined_closes.csv')
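A short follow-up check, assuming compile_data above has already produced the joined file:

joined = pd.read_csv('sp500_joined_closes.csv', index_col=0)
print(joined.shape)
print(joined.iloc[:3, :5])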
def init_population(filename, year=2017):
    filepath = basepath + filename
    df = pandas.read_csv(filepath)

    num_places = len(df)
    df = df.drop(df.columns[0], axis=1)
    # print(df)

    population = Population(num_places)
    # age_ranges = Parser.get_age_ranges()
    population_ages = Parser.get_population_data()[year]

    total_pop = sum(population_ages.values())
    for pop in population_ages:
        percentage = population_ages[pop] / total_pop
        population_ages[pop] = percentage

    for _, row in df.iterrows():
        lug_pop = row['POPULATION']
        home_loc = row['PLACE_ID']

        for (a, b) in population_ages:
            percentage = population_ages[(a, b)]
            pop_in_range = int(round(percentage * lug_pop, 0))
            population.add_batch_age_range(pop_in_range, home_loc, a, b)

    return population
Example #9
def load_population_data(filename):
    filepath = basepath + filename
    df = pandas.read_csv(filepath)

    sections = {}

    for _, row in df.iterrows():
        section_id = row['SEC']
        fensino_1bas = row['N_IND_RESIDENT_FENSINO_1BAS']
        fensino_2bas = row['N_IND_RESIDENT_FENSINO_2BAS']
        fensino_3bas = row['N_IND_RESIDENT_FENSINO_3BAS']
        fensino_sec = row['N_IND_RESIDENT_FENSINO_SEC']
        fensino_possec = row['N_IND_RESIDENT_FENSINO_POSSEC']
        reform = row['N_IND_RESID_PENS_REFORM']
        desemp = row['N_IND_RESID_DESEMP']

        sections[section_id] = {
            'fensino_1bas': fensino_1bas,
            'fensino_2bas': fensino_2bas,
            'fensino_3bas': fensino_3bas,
            'fensino_sec': fensino_sec,
            'fensino_possec': fensino_possec,
            'reform': reform,
            'desemp': desemp
        }
        # print(sections[section_id])
        # print(section_id, fensino_1bas, fensino_2bas, fensino_3bas, fensino_sec, fensino_possec, reform, desemp, sep = ' - ')

    return sections
def add_previous_columns(drop_columns, periodicity_key, archive_name, test_size, historic_count, create_complete_csv):
    # read clean csv
    df = pandas.read_csv('../data/dataTemp.csv')

    # transform object to date
    df['cnv_date'] = pandas.to_datetime(df.cnv_date, infer_datetime_format=True)

    # order by date, descending
    df = df.sort_values(by='cnv_date', ascending=False)

    # add time variable for historical values
    df[time_variables_columns(periodicity_key)] = previous_data_historic(periodicity_key,df)

    # create new data
    data = util_create_scenery(df,historic_count,drop_columns)

    # split into train and test sets using the test_size fraction
    train, test = train_test_split(data, test_size=test_size)

    # create 2 csv files, from train and test
    test_path = '../data/results/' + archive_name + '_test' + '.csv'
    train_path = '../data/results/' + archive_name + '_train' + '.csv'

    train.to_csv(train_path,header=None,index=False)
    test.to_csv(test_path,header=None,index=False)

    # if you need complete data set
    if create_complete_csv:
        path = '../data/results/' + archive_name + '.csv'
        data.to_csv(path, header=None, index=False)

    print('*******Data Created Successfully*******')
    print('******* ' + archive_name + ' Done' + '*******')
Example #11
def upload_csv():
    try:
        file_name=''
        album_name=''  
        qib_name=''
        qib_description=''
        csv_type=''
        if request.form:
            album_name = request.form['album_name']
            qib_name = request.form['qib_name']
            qib_description = request.form['qib_description']
            file_name = request.form['file_name']
            csv_type = request.form['csv_type']
        if request.method == 'POST' and request.files:
            data = pd.read_csv(request.files[file_name])
            headers = data.columns.values.tolist()
            csv_check = check_headers(headers, csv_type)
            if not csv_check:
                return jsonify('Invalid columns')
            else:
                if csv_type == 'Custom QIB':
                    load_custom_filter_csv_to_db(data,album_name, qib_name,qib_description) 
                if csv_type == 'New QIB':
                    load_file_to_db(data,album_name,qib_name,qib_description)
                if csv_type == 'Outcome list':
                    add_outcome(data)
        return jsonify('OK')
    except Exception as err:
        return jsonify(f"{err.__class__.__name__}: {err}")
Example #12
def main():
    print('loading file')
    print('................')
    print('\n')
    dirname = os.path.dirname(__file__)
    filename = os.path.join(dirname, '../data/FormatoDeAlmacenamiento1.csv')
    df = pd.read_csv(filename, sep=';', header=None, na_values=" NaN")

    #print (df)
    # extract the HH:MM:SS timestamp from the first column
    hora = df[0].str.extract(r'((?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d)')
    #print (hora)
    df2 = pd.DataFrame()
    df2[0] = df[0].str.extract(r'((?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d)')
    df2[1] = df[1]
    # replace the remaining NaNs with zero
    df2 = df2.fillna(0)
    #print(df2)
    print("grouping by minute")
    df2[0] = pd.DatetimeIndex(df2[0])
    df2.set_index(keys=0, inplace=True)
    ini = datetime.time(00, 18, 0)
    fin = datetime.time(23, 59, 0)
    df3 = df2[[1]].between_time(ini, fin)
    df3 = df3.groupby([1])[[1]].count()
    print(df3)
def prepare_dataset(dataset_path):
    '''
    Read a comma separated text file where
    - the first field is an ID number
    - the second field is a class label 'B' or 'M'
    - the remaining fields are real-valued

    Return two numpy arrays X and y where
    - X is two dimensional. X[i,:] is the ith example
    - y is one dimensional. y[i] is the class label of X[i,:]
      y[i] should be set to 1 for 'M', and 0 for 'B'

    @param dataset_path: full path of the dataset text file

    @return
    X, y
    '''

    #read the dataset
    dataset = pd.read_csv(dataset_path, header=None)
    X = dataset.drop(dataset.columns[1], axis=1)
    #standardizing X value for better prediction
    sc = StandardScaler()
    X = sc.fit_transform(X)
    # cast the labels to string to make sure they have the correct type
    y = dataset.iloc[:, 1].astype(str)
    #encode label result
    y = encode(y)
    print(X.shape)
    print(y.shape)
    return X, y
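A minimal usage sketch for prepare_dataset; the file path is a placeholder, the classifier choice is illustrative, and it assumes encode() returns the numeric 0/1 labels the docstring describes:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X, y = prepare_dataset('data/medical_records.data')   # hypothetical path
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print(clf.score(X_test, y_test))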
def get_recommendations(filename, recommendations_dict):
    # open for reading; universal newline handling is the default in Python 3
    doc = codecs.open(filename=filename, mode='r', encoding='UTF-8')
    raw_data = pd.read_csv(doc, sep='\r')

    return parse_recommendations(raw_data, recommendations_dict)
Example #15
    def __init__(self, board_file='default.csv', number_of_players=10):

        # class ,name, position, monopoly, monopoly_size, price, build_cost, rent, rent_house_1, rent_house_2, rent_house_3, rent_house_4, rent_hotel, default_income
        self.specs = pd.read_csv(board_file).to_dict('records')
        self.number_of_locations = len(self.specs)
        self.number_of_players = number_of_players
        self.token_emojis = [
            random_emoji() for i in range(0, number_of_players + 1)
        ]
Example #16
def save_data():
    df = pd.read_csv('./douban_book/comment_25984204.txt')
    df["sentiments"] = df["content"].map(
        lambda c: snownlp.SnowNLP(c).sentiments)
    engine = create_engine('mysql+pymysql://root:@127.0.0.1:3306/douban')
    df.to_sql(name='book',
              con=engine,
              chunksize=1000,
              if_exists='replace',
              index=False)
Example #17
def process_data_for_labels(ticker):
    hm_days = 7
    df = pd.read_csv('sp500_joined_closes.csv', index_col=0)
    tickers = df.columns.values.tolist()
    df.fillna(0, inplace=True)
    for i in range(1, hm_days + 1):
        df['{}_{}d'.format(
            ticker, i)] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]
    df.fillna(0, inplace=True)
    return tickers, df
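A short usage sketch; it assumes the joined closes file from compile_data is on disk and that the ticker is one of its columns:

tickers, df = process_data_for_labels('AAPL')
print(df[['AAPL'] + ['AAPL_{}d'.format(i) for i in range(1, 8)]].head())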
    def calculate_offset(self, sample_values="", nominal=0):
        if sample_values != "":
            d = pd.read_csv(StringIO(sample_values))
            self.raw_data = d
            d_u = d.loc[d['direction'] == 'up']
            d_d = d.loc[d['direction'] == 'down']
            mean_u = d_u.rotation.mean()
            mean_d = d_d.rotation.mean()
            self.offset = ((mean_d + mean_u) / 2) - nominal
            self.calibrated = True
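A small usage sketch for calculate_offset; the CSV text is made-up sample data with the 'direction' and 'rotation' columns the method expects, and Encoder is a stand-in name for whatever class owns the method:

sample = "direction,rotation\nup,10.2\nup,10.4\ndown,9.6\ndown,9.8\n"
sensor = Encoder()                        # hypothetical owner class
sensor.calculate_offset(sample_values=sample, nominal=10)
print(sensor.offset, sensor.calibrated)   # 0.0 and True for this symmetric sample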
Example #19
def load_sections(filename):
    filepath = basepath + filename
    df = pandas.read_csv(filepath)

    sections = {}
    for _, row in df.iterrows():
        section = Section(row['SEC11'], row['lat'], row['lon'])
        sections[row['SEC11']] = section

    return sections
    def activities(self, curr_location):
        # Extracting the CSVs for indoor information
        arena_import = pd.read_csv('Arenas.csv', index_col=0)
        arenas = self.coordinates(arena_import, 2, 7, 6, curr_location)

        library_import = pd.read_csv('Libraries.csv', index_col=0)
        libraries = self.coordinates(library_import, 2, 9, 8, curr_location)

        museums_import = pd.read_csv('Museums_and_Galleries.csv', index_col=0)
        museums = self.coordinates(museums_import, 2, 4, 3, curr_location)

        community_import = pd.read_csv('Recreation_and_Community_Centres.csv',
                                       index_col=0)
        communities = self.coordinates(community_import, 2, 4, 3,
                                       curr_location)

        # Extracting the CSVs for outdoor information
        beaches_import = pd.read_csv('Beaches.csv', index_col=0)
        beaches = self.coordinates(beaches_import, 2, 5, 4, curr_location)

        campground_import = pd.read_csv('Campgrounds.csv', index_col=0)
        camps = self.coordinates(campground_import, 1, 5, 4, curr_location)

        waterfalls_import = pd.read_csv('City_Waterfalls.csv', index_col=0)
        falls = self.coordinates(waterfalls_import, 3, 14, 13, curr_location)

        pads_import = pd.read_csv('Spray_Pads.csv', index_col=0)
        pads = self.coordinates(pads_import, 2, 9, 8, curr_location)

        # Sorting the Lists by distance and categorising the information
        indoor = [arenas, libraries, museums, communities]
        outdoor = [beaches, camps, falls, pads]

        for group in indoor:
            group.sort(key=lambda item: item.get('Distance'))

        for group in outdoor:
            group.sort(key=lambda item: item.get('Distance'))

        # Weather Information API
        resp = requests.get(
            "https://api.darksky.net/forecast/8b486e7acbd606454f4a0f8f95b56886/43.263444914224245,-79.91824930126315"
        )

        data = resp.json()

        # Gets information on the temperature, current precipitation and predicted precipitation
        temp = data['currently']['apparentTemperature']
        precip = data['currently']['precipIntensity']
        predict_precip = data['currently']['precipProbability']

        return self.output(temp, precip, predict_precip, indoor, outdoor)
def RunForFiltersAndRuns(baseDir, numOfFP, hiddlayers, testSetSize,
                         filterRange, runsForFilter, runsForEnsemble):
    savepath = os.path.join(
        baseDir,
        'RunResults_FP' + str(numOfFP) + '_hidd' + str(hiddlayers) + '.csv')
    collist = ['FilterCount', 'RunNo', 'MAE'
               ] + ['AE' + str(i) for i in range(testSetSize)]
    if not os.path.exists(savepath):
        df = pd.DataFrame(columns=collist)
        df.to_csv(savepath)

    for k in filterRange:
        print('Replication count', str(k + 1))
        for i in range(runsForFilter):
            print('RUN', str(i + 1))
            df = pd.read_csv(savepath, index_col=0)
            if len(df[(df['FilterCount'] == k + 1)
                      & (df['RunNo'] == i + 1)]) > 0:
                print('Already computed. Continuing...')
                continue
            print('Computing...')
            pred = SubnetWrapper(maxC=4,
                                 maxO=4,
                                 maxH=0,
                                 num_of_fingerprints=numOfFP,
                                 replicationCount=k + 1,
                                 learn_atomtype_weights=True,
                                 learn_filter_contrib_weights=True,
                                 shareWeightsAcrossAtomTypes=True,
                                 activation=ActivationFunctions.tanh,
                                 regularizer=RegularizerWrapper(True, 0.001),
                                 hiddenLayer=hiddlayers,
                                 dropout=0.9,
                                 initializer=WeightInitializerWrapper(
                                     InitializerTypes.RandomNormal,
                                     rand_norm_stddev=0.0001),
                                 learningrate=0.001,
                                 runcnt=runsForEnsemble,
                                 tolerance=1e-3)
            mae, aes_for_this_run = pred.makeExtrapolatingPrdictions(
                testSetSize=testSetSize,
                splitIndices=[217, 247],
                max_epochs=10000,
                encoding1=encodings,
                energy1=energies,
                encoding2=enctest,
                energy2=engtest,
                batchsize=217,
                useBatchNorm=False,
                isVerbose=False)
            print('MAE for current run:', mae)
            df.loc[len(df)] = dict(
                zip(collist, [k + 1, i + 1, mae] + aes_for_this_run.tolist()))
            df.to_csv(savepath)
def read_data_set():
    data_frame = pandas.read_csv('../data/data.csv')

    # transform timestamp to datetime
    date_set = pandas.to_datetime(data_frame.date, unit='s')
    data_frame['cnv_date'] = date_set

    # create error columns (2% of the reading) for mq2, mq7 and mq135
    data_frame['mq2_error'] = data_frame.mq2 * 0.02
    data_frame['mq7_error'] = data_frame.mq7 * 0.02
    data_frame['mq135_error'] = data_frame.mq135 * 0.02
    return data_frame
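A brief usage sketch, assuming '../data/data.csv' exists with the date and mq2/mq7/mq135 columns the function references:

df = read_data_set()
print(df[['cnv_date', 'mq2', 'mq2_error']].head())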
Example #23
def readData(filename):
    if filename == "":
        raise FileNotFoundError("File not found")
    try:
        dataset = pd.read_csv(filename)
        X = dataset.iloc[:, :-1]
        y = dataset.iloc[:, -1]
    except Exception:
        print("error")
        raise  # re-raise so the caller can handle it
    print("complete")
    return X, y
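A minimal usage sketch for readData; the file name is a placeholder for any CSV whose last column is the target:

X, y = readData('measurements.csv')    # hypothetical file
print(X.shape)
print(y.value_counts())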
def get_meanCV(file):
    CSV_file = pandas.read_csv(file)
    expt_samples = len(CSV_file)
    DNAs = 7
    replicates = expt_samples // DNAs  # integer number of replicates per DNA sample

    if "A13" in CSV_file["Well"].values:
        plate_map = Container(None, _CONTAINER_TYPES['384-pcr'])
    else:
        plate_map = Container(None, _CONTAINER_TYPES['96-pcr'])

    start = 0
    replicate_locs = []
    for i in range (0,DNAs-1):
        loc = [plate_map.humanize(s) for s in range(start, start + replicates)]
        replicate_locs.append(loc)
        start += replicates

    DNA_Ct = []
    for h in replicate_locs:    
        for x in h:
            Replicate_Ct_DNA = []
            data_source = open(file)    
            replicate_locations = h
            for line in data_source:
                split_line=line.split(',')
                wellID=split_line[0]
                Ct=split_line[3]
                for w in replicate_locations:
                    if w == wellID:
                        try:
                            Replicate_Ct_DNA.append(float(Ct))
                        except ValueError:
                            Replicate_Ct_DNA.append(0.0)
        DNA_Ct.append(Replicate_Ct_DNA)

    percentageCV = []
    for n in DNA_Ct:
        try:
            percentageCV.append(((stats.pstdev(n)/stats.mean(n))*100))
        except ZeroDivisionError as err:
            percentageCV.append(0.0)
    meanCV = stats.mean(percentageCV)
    mean_SD = []
    for n in DNA_Ct:
        line = []
        line.append(stats.mean(n))
        line.append(stats.pstdev(n))
        mean_SD.append(line)
    writer = csv.writer(open('./output/mean_SD.csv', 'w'))
    writer.writerows(mean_SD)
    return meanCV
Example #25
def init_population_census_2011(custom_origin_index=None):
    print("Initializing population")

    filepath = basepath + "pombal-detailed.csv"
    df = pandas.read_csv(filepath)

    # total = df.iloc[[47]]

    df = df.drop(df.index[47])

    ratio = get_resize_ratio()

    zones = df['Localidade'].tolist()
    population = Population()
    population.set_zones(zones)

    for index, row in df.iterrows():
        new_num_pop = 0
        for rng in AGE_RANGES:
            age_num = int(round(row[rng] * ratio, 0))
            new_num_pop += age_num
            df.loc[index, rng] = age_num
        df.loc[index, 'Total'] = new_num_pop

    for _, row in df.iterrows():
        lugar = row['Localidade']
        if custom_origin_index is not None:
            lugar = zones[custom_origin_index]
        for i in range(len(AGE_RANGES) - 2):
            key = AGE_RANGES[i]
            (a, b) = AGE_RANGES_NUM[i]
            num = row[key]
            population.add_batch_age_range(num, lugar, a, b)
        adult_range_num = row[len(AGE_RANGES)]
        senior_range_num = row[len(AGE_RANGES) + 1]
        adult_distribution = get_adult_age_distribution(adult_range_num)
        senior_distribution = get_senior_age_distribution(senior_range_num)
        for (a, b) in adult_distribution:
            num_age_range = adult_distribution[(a, b)]
            population.add_batch_age_range(num_age_range, lugar, a, b)
        for (a, b) in senior_distribution:
            num_age_range = senior_distribution[(a, b)]
            population.add_batch_age_range(num_age_range, lugar, a, b)

    population.get_stats().add_age_distribution_stats(
        0, population.get_population_age_distribution())
    print("Population initialized - Total population: {}".format(
        population.get_population_size()))
    population.get_stats().print_population_age_stats()

    return population
    def __init__(self):
        database1 = 'database.csv'
        database = pd.read_csv(database1)
        self.x = database[[u'Feature1', u'Feature2']]
        self.y1 = database.Target1
        self.clf1 = MLPClassifier(solver='lbfgs',
                                  alpha=1e-5,
                                  hidden_layer_sizes=(20, 20),
                                  random_state=1)
        self.clf1.fit(self.x, self.y1)
        self.clf2 = MLPClassifier(solver='lbfgs',
                                  alpha=1e-5,
                                  hidden_layer_sizes=(20, 20),
                                  random_state=1)
Example #27
    def read_schools(self):
        filepath = basepath + self.schools_list

        df = pandas.read_csv(filepath)
        
        for _, row in df.iterrows():
            lat = row['Lat']
            lon = row['Lon']
            name = row['School Name']
            school_type = row['School Type']
            zone = self.get_point_zone(lat, lon)
            if zone is not None:
                obj = {'name': name, 'zone': zone, 'type': school_type}
                self.schools[school_type].append(obj)
Example #28
    def read_workplaces(self):
        filepath = basepath + self.workplaces_list
        df = pandas.read_csv(filepath)

        for _, row in df.iterrows():
            lat = row['Lat']
            lon = row['Lon']
            name = row['Name']
            work_type = row['Work Type']
            size = row['Size']
            zone = self.get_point_zone(lat, lon)
            if zone is not None:
                obj = {'name': name, 'zone': zone, 'type': work_type, 'size': size}
                self.workplaces[work_type].append(obj)
Example #29
def read_summaries(district):
    """
    Read in summaries of inspection reports into a Pandas dataframe.
    """
    # Just use the columns we need
    cols = [
        "county", "licnum", "sitename", "streetaddy", "cityaddy", "zip",
        "inspnum", "insptype", "inspdispos", "inspdate", "totalvio", "highvio",
        "licid", "visitid"
    ]

    try:
        insp = pd.read_csv(
            district,
            usecols=[2, 4, 5, 6, 7, 8, 9, 12, 13, 14, 17, 18, 80, 81],
            names=cols,
            dtype=object,
            encoding="ISO-8859-1")
    except FileNotFoundError:
        print("Sorry, the csv file for " + district + " was not found.")
        return None
    else:
        #Clean up some of the data before storing it in the db
        insp.sitename = insp.sitename.str.title()
        insp.sitename = insp.sitename.str.replace('Mcdonald\'s', 'McDonald\'s')
        insp.sitename = insp.sitename.str.replace('Mcdonalds', 'McDonald\'s')
        insp.sitename = insp.sitename.str.replace('Bbq', 'BBQ')
        insp.sitename = insp.sitename.str.replace(r'\'S ', '\'s ')
        insp.streetaddy = insp.streetaddy.str.title()
        insp.streetaddy = insp.streetaddy.str.replace(' Sw ', ' SW ')
        insp.streetaddy = insp.streetaddy.str.replace(' Se ', ' SE ')
        insp.streetaddy = insp.streetaddy.str.replace(' Nw ', ' NW ')
        insp.streetaddy = insp.streetaddy.str.replace(' Ne ', ' NE ')
        insp.streetaddy = insp.streetaddy.str.replace(' Rd', ' Road')
        insp.streetaddy = insp.streetaddy.str.replace(' Sr ', ' State Road ')
        insp.streetaddy = insp.streetaddy.str.replace(' Ste ', ', Suite ')
        insp.streetaddy = insp.streetaddy.str.replace(r'(?<=[4-9])Th ', 'th ')
        insp.streetaddy = insp.streetaddy.str.replace(r'2Nd ', '2nd ')
        insp.streetaddy = insp.streetaddy.str.replace(r'3Rd ', '3rd ')
        insp.streetaddy = insp.streetaddy.str.replace(r' Us ', ' US ')
        insp.cityaddy = insp.cityaddy.str.title()
        insp = insp.applymap(lambda x: str(x).strip()
                             if len(str(x).strip()) else None)
        insp['visitid'] = insp['visitid'].apply(
            int)  # so it can be filtered against df
        insp.inspdate = pd.to_datetime(insp.inspdate)
        insp.inspdate = insp.inspdate.dt.strftime('%Y, %m, %d')

    return insp
def main():
    print('loading file')
    print('................')
    print('\n')
    dirname = os.path.dirname(__file__)
    filename = os.path.join(dirname, '../data/FormatoDeAlmacenamiento1.csv')
    df = pd.read_csv(filename, sep=';', header=None, na_values=" NaN")

    print(df)
    # print information about the loaded file
    print(df.info())
    print('\n' * 2)
    print(df.describe())
    df.groupby(1)[2].count().plot(kind='pie', legend='reverse')
    # clean the data
    #
    print('\n' * 2)
def pget(newFile):
    # read from csv
    df = pd.read_csv(newFile)
    # drop rows with a NULL value in the Answer column
    df.dropna(axis=0, subset=['Answer'], inplace=True)

    # helper to check whether a string is CamelCase
    def is_camel_case(s):
        if s != s.lower() and s != s.upper() and "_" not in s and sum(i.isupper() for i in s[1:-1]) == 2:
            return True
        return False

    # run is_camel_case on the 'Answer' column and store the result in 'Row'
    df['Row'] = df['Answer'].apply(is_camel_case)
    # drop rows where the check returned False
    df.drop(df.loc[df['Row'] == False].index, inplace=True)
    # save to a new csv file
    newFileName = os.path.splitext(newFile)[0]
    df.to_csv(newFileName + '_' + strftime('%Y-%m-%d') + '.csv', sep=',')
Example #32
import pandas as pd

stEdges = pd.read_csv("cal.cedge.csv")
stNodes = pd.read_csv("cal.cnode.csv")

print("COLUMNS FOR EDGES: ", stEdges.columns)
print("COLUMNS FOR NODES: ", stNodes.columns)

startCoords = []
endCoords = []

# as_matrix() was removed from pandas; to_numpy() is the current equivalent
nodes = stNodes.to_numpy()

for i, edge in stEdges.iterrows():
    # print(edge)
    # print(edge['startID'], edge['endID'])
    start = int(edge['startID'])
    end = int(edge['endID'])

    startCoords.append((float(nodes[start][2]), float(nodes[start][1])))
    endCoords.append((float(nodes[end][2]), float(nodes[end][1])))
    # print(edge['NodeID'])
    # which st['NodeID'] == startID

startCoords = pd.Series(startCoords, name='startCoords')
endCoords = pd.Series(endCoords, name='endCoords')
# print(startCoords)

df = pd.concat([stEdges['EdgeID'], startCoords, endCoords, stEdges['distance']], axis=1)
# print(df)
Example #33
    @staticmethod
    def from_csv(csv_string, column_types=None, stand_in_columns=None):
        # no 'self' parameter, so this is assumed to be a static factory method
        df = pandas.read_csv(StringIO(csv_string), dtype=column_types)
        _add_stand_in_columns(df, stand_in_columns)
        return QFrame(df)
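A usage sketch for the factory above, built from an inline CSV string; QFrame comes from the surrounding snippet and the column_types mapping is illustrative:

csv_string = "name,qty\nfoo,3\nbar,5\n"
qf = QFrame.from_csv(csv_string, column_types={'qty': 'int64'})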
import pandas as pd
import collections, math, random, sys, time, datetime
from copy import deepcopy

#weights of different types of crimes
CRIME_TYPE_WEIGHTS = {'ROBBERY':5, 'SEX OFFENSES, FORCIBLE':6,'DRUG/NARCOTIC':2, 'KIDNAPPING':7, 'SEX OFFENSES, NON FORCIBLE':3, 'ASSAULT':9}

#number of regions to divide the city into for k-means clustering
NUM_REGIONS = 10

edges = pd.read_csv("trimmed_edges.csv")
#crimes = pd.read_csv("crimes_with_streets.csv")
crimes = pd.read_csv("mini_crimes_set.csv")
testCrimes = pd.read_csv("test_crime_data.csv")

#a dictionary from edgeIDs to CrimeStreetObjects
streets = {}

#a dictionary from crimeStreets to a list of known crimes read in from testCrimes (crime type, time)
knownCrimes = {}

def getDistance(a,b):
	return (a[0] - b[0])*(a[0] - b[0]) + (a[1] - b[1])*(a[1] - b[1])

def kmeans(crimes, K, maxIters):
	'''
    crimes: list of crime location and closest street pairs, ((lat, long), closestEdge)
    K: number of desired clusters. Assume that 0 < K <= |examples|.
    maxIters: maximum number of iterations to run for (you should terminate early if the algorithm converges).
    Return: (length K list of cluster centroids,
            list of assignments, (i.e. if crimes[i] belongs to centers[j], then assignments[i] = j)
Example #35
    def distFromStreet(self, loc):
        slope = (self.end[0] - self.start[0]) / (self.end[1] - self.start[1])
        perp_slope = -1 / slope
        b = self.start[1] - slope * self.start[0]
        b2 = loc[1] - perp_slope * loc[0]
        dist_lat = (b2 + b) / (slope - perp_slope)
        dist_long = dist_lat * slope + b
        # print(dist_lat, dist_long)
        dist = math.sqrt((dist_lat - loc[0])**2 + (dist_long - loc[1])**2)
        return dist

street = CrimeStreet(1, (1.0, 1.0), (2.0, 2.0), math.sqrt(2))
print(street.distFromStreet((2, 1)))


df = pd.read_csv("crimes_sub2.csv")
print(df.columns)

edges = pd.read_csv("edgeLocs.csv")

# prune the edges outside of the bounds 37.5-38.5N, -122.729 & -121.888
trimmed_edges = []
fail_count = [0 for _ in range(4)]
for edge in edges.iterrows():
    e = edge[1]
    start = eval(e['startCoords'])
    end = eval(e['endCoords'])
    if start[1] > -121.888 or start[1] < -122.729: 
        fail_count[0] += 1
        continue
    if start[0] > 38.5 or start[0] < 37.5: 
import fileinput
import pandas as pd

filename = "trimmed_edges.csv"

edges = pd.read_csv(filename)

lines = tuple(open(filename, 'r'))

fout = open('sf_edges','w')

i = 0
line = lines[i]
fout.write(line)
for edge in edges.iterrows():
	i += 1
	e = edge[1]
	startCoords = eval(e['startCoords'])
	endCoords = eval(e['endCoords'])
	if startCoords[1] > -122.35 or startCoords[1] < -122.52: 
		continue
	if startCoords[0] > 37.835 or startCoords[0] < 37.7: # changing these numbers to be more refined to SF
		continue
	if endCoords[1] > -122.35 or endCoords[1] < -122.52:
		continue
	if endCoords[0] > 37.835 or endCoords[0] < 37.7:
		continue
	line = lines[i]
	fout.write(line)

fout.close()
Example #37
            return min([math.sqrt((self.end[0]-loc[0])**2 + (self.end[1]-loc[1])**2), \
                math.sqrt((self.start[0]-loc[0])**2 + (self.start[1]-loc[1])**2)])
        #print dist_lat, dist_long
    	dist = math.sqrt((dist_lat-loc[0])**2 + (dist_long-loc[1])**2)
    	return dist

# street = CrimeStreet(1,(1.0,1.0),(2.0,2.0),math.sqrt(2))
# print street.distFromStreet((2,1))
# st1 = CrimeStreet(8889, (37.707062, -121.93736299999999), (37.707069, -121.928421), 0.008942)
# st2 = CrimeStreet(8834, (37.735077000000004, -122.400658), (37.727768, -122.40138999999999), 0.007346)
# print 'FOR STREET 8889'
# print st1.distFromStreet((37.7783276318163, -122.426642472038))
# print 'FOR STREET 8834'
# print st2.distFromStreet((37.7783276318163, -122.426642472038))

edges = pd.read_csv("trimmed_edges.csv")
print 'finished reading trimmed_edges.csv'

streets = {}
for edge in edges.iterrows():
    e = edge[1]
    curr = CrimeStreet(e['EdgeID'], eval(e['startCoords']), eval(e['endCoords']), float(e['distance']))
    streets[e['EdgeID']] = curr

crime_data = pd.read_csv("crimes_sub2.csv")
print 'finished reading crimes_sub2.csv'

# print "COLUMNS FOR NODES: ", crime_data.columns
print crime_data.axes

cats = crime_data['Category']
             metrics.v_measure_score(labels, modelAlgo.labels_),
             metrics.adjusted_rand_score(labels, modelAlgo.labels_)
             ))

def EncodedColumnValues(documentDataFrame, columnToBeEncoded):
    """This function enumerates the string values of the columns to numbers"""
    modifiedDataFrame = documentDataFrame.copy()
    encodingTargetList = modifiedDataFrame[columnToBeEncoded].unique()
    convert_to_int = {strName: float(n) for n, strName in enumerate(encodingTargetList)}
    modifiedDataFrame["Target"+columnToBeEncoded] = modifiedDataFrame[columnToBeEncoded].replace(convert_to_int).astype(np.float64)
    return (modifiedDataFrame)

if __name__ == '__main__':
    totalTime = time()
    """Please change the below path to where you have stored the database in your machine"""
    documentDataFrame = ps.read_csv('/media/nandan/Store/Python Excersises/FinalProjectML/train.csv')
    trainDataFrameIndex, testDataFrameIndex = train_test_split(documentDataFrame.index, train_size=0.7)
    trainDataFrame = documentDataFrame.iloc[trainDataFrameIndex]
    testDataFrame = documentDataFrame.iloc[testDataFrameIndex]
    modifiedtrainDataFrame = EncodedColumnValues(trainDataFrame, "Category")
    modifiedtestDataFrame = EncodedColumnValues(testDataFrame, "Category")
    
    featuresList = []
    featuresTestList = []
    modifiedtrainDataFrame = EncodedColumnValues(modifiedtrainDataFrame, "DayOfWeek")
    modifiedtestDataFrame = EncodedColumnValues(modifiedtestDataFrame, "DayOfWeek")
    modifiedtrainDataFrame = EncodedColumnValues(modifiedtrainDataFrame, "PdDistrict")
    modifiedtestDataFrame = EncodedColumnValues(modifiedtestDataFrame, "PdDistrict")
    
    featuresList.append(modifiedtrainDataFrame.columns[11])
    featuresList.append(modifiedtrainDataFrame.columns[7])
def main():
    parser = argparse.ArgumentParser(description="""Convert conllu to conll format""")
    parser.add_argument("--input", help="conllu file", default="../data/en-ud-dev.conllu")
    parser.add_argument("--lang")

    parser.add_argument("--posrules", help="head POS rules file", default="../data/posrules.tsv")
    parser.add_argument("--output", help="target file", default="testout.conllu")
    parser.add_argument("--parsing_strategy", choices=["rules", "pagerank", "adjacent"], default="pagerank")
    parser.add_argument(
        "--steps",
        choices=["twotags", "complete", "neighbors", "verbs", "function", "content", "headrule"],
        nargs="+",
        default=[""],
    )
    parser.add_argument("--reverse", action="store_true", default=True)
    parser.add_argument("--rule_backoff", choices=["cycle", "left", "right"], default="left")
    parser.add_argument("--ablation", choices=["pagerank", "2stepdecoding"], default="pagerank")

    args = parser.parse_args()

    if sys.version_info < (3, 0):
        print("Sorry, requires Python 3.x.")  # suggestion: install anaconda python
        sys.exit(1)

    headrules = pd.read_csv(args.posrules, sep="\t")
    cio = CoNLLReader()
    orig_treebank = cio.read_conll_u(args.input)
    ref_treebank = cio.read_conll_u(args.input)
    modif_treebank = []
    posbigramcounter, wordcounter = count_pos_bigrams(orig_treebank)
    functionlist = [x for x, y in wordcounter.most_common(100)]
    print(functionlist)
    fill_out_left_and_right_attach(posbigramcounter)
    if args.parsing_strategy == "pagerank":
        for o, ref in zip(orig_treebank, ref_treebank):
            s = copy.copy(o)
            s.remove_edges_from(s.edges())
            s.remove_node(
                0
            )  # From here and until tree reconstruction there is no symbolic root node, makes our life a bit easier

            if "twotags" in args.steps:
                s = map_to_two_tags(s, functionlist)
            if "complete" in args.steps:
                s = add_all_edges(s)
            if "neighbors" in args.steps:
                s = add_short_edges(s)
            if "verbs" in args.steps:
                s = add_verb_edges(s)
            if "function" in args.steps:
                s = manage_function_words(s)
            if "content" in args.steps:
                s = relate_content_words(s)
            if "headrule" in args.steps:
                s = add_head_rule_edges(s, headrules)
            tree_decoding_algorithm_content_and_function(s, headrules, args.reverse, args.ablation)
            modif_treebank.append(s)
            if args.reverse:
                r = ".rev"
            else:
                r = ".norev"
            outfile = Path(args.lang + "_" + args.output + "_" + "_".join(args.steps) + r + ".conllu")
            cio.write_conll(
                modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False
            )
            outfile = Path(args.lang + "_" + args.output)
            cio.write_conll(
                modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False
            )
    elif args.parsing_strategy == "adjacent":
        for s in orig_treebank:
            s.remove_edges_from(s.edges())
            s = attach_adjacent(s, args.rule_backoff)
            modif_treebank.append(s)
        outfile = Path(args.output + "." + args.rule_backoff)
        cio.write_conll(modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False)

    else:
        for s in orig_treebank:
            s = add_high_confidence_edges(s, posbigramcounter, args.rule_backoff)
            modif_treebank.append(s)

        for k in sorted(scorerdict.keys()):
            prec = sum([p for p, r in scorerdict[k]]) / len(scorerdict[k])
            reca = sum([r for p, r in scorerdict[k]]) / len(scorerdict[k])
            print("{0}, {1:.2f}, {2:.2f}".format(k, prec, reca))
        outfile = Path(args.output + ".rules")
        cio.write_conll(modif_treebank, outfile, conllformat="conllu", print_fused_forms=False, print_comments=False)
    elif 21 <= int(data) < 24:
        return 'F1' # 21:00-24:00'
    elif 0 <= int(data) < 4:
        return 'F2' # 00:00-04:00'
    else:
        return '-'

import pandas as pd
# from datetime import datetime as dt
# from random import random as rand
# from numpy.random import randn

print('1/8 - Loading data')
df = pd.read_csv('dataset.csv',
                 header=None,
                 parse_dates=[1],
                 names=['UserID', 'DateTime', 'AntennaID'],
                 infer_datetime_format=True)

temp = pd.DatetimeIndex(df['DateTime'])
df['Date'] = temp.date
df['Time'] = temp.time

"""
# Filtering User
df_user_1 = df.groupby(['UserID'], sort=False).agg({"Date": lambda x: x.nunique()})
df_user_2 = df_user_1[df_user_1['Date'] > 2]
df_user_3 = df_user_2['Date']
df_user_3.to_csv('output_df_user.csv')
df_user_list = pd.read_csv('output_df_user.csv',
                 header=None,
    return True


BeginList = []


for fx in fn.frames():
    if isFirstBeginner(fx['name']):
        BeginList.append(fx['name']+'\t'+str(NumLexU(fx['name']))+'\t_')

#print(len(BeginList))
print('\n'.join(sorted(BeginList)))


D={}
D['WHO']=['People']
D['WHAT']=['Event','Eventive_affecting']
D['WHERE']=['Locale']
D['WHY']=['Event','Eventive_affecting']
D['HOW']=['']


frames = pd.read_csv("../res/frametargetlexicon.tsv",sep="\t")

framenames=set(frames.framename)


# Find FN first beginners and see which ones yield a perfect match
# Find the second-level children of these and repeat