Example #1
def joined_df():
    """
    Loop through all districts in the state to build the summaries
    dataframe.
    """

    district1 = f"ftp://dbprftp.state.fl.us/pub/llweb/1fdinspi.csv"
    district2 = f"ftp://dbprftp.state.fl.us/pub/llweb/2fdinspi.csv"
    district3 = f"ftp://dbprftp.state.fl.us/pub/llweb/3fdinspi.csv"
    district4 = f"ftp://dbprftp.state.fl.us/pub/llweb/4fdinspi.csv"
    district5 = f"ftp://dbprftp.state.fl.us/pub/llweb/5fdinspi.csv"
    district6 = f"ftp://dbprftp.state.fl.us/pub/llweb/6fdinspi.csv"
    district7 = f"ftp://dbprftp.state.fl.us/pub/llweb/7fdinspi.csv"

    all_districts = [
        district1, district2, district3, district4, district5, district6,
        district7
    ]

    # Collect one summary frame per district, then concatenate once after the
    # loop (resetting the list inside the loop would keep only the last district).
    insp_list = []
    for district in all_districts:
        insp = read_summaries(district)
        insp_list.append(insp)
    df_insp = pd.concat(insp_list, axis=0)

    return df_insp
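read_summaries is not defined in this snippet; a minimal sketch of what such a helper could look like, assuming each district URL points at a plain CSV that pandas can fetch directly over FTP (the helper body is illustrative, not the original code):

import pandas as pd

def read_summaries(url):
    # Hypothetical helper: download one district's inspection CSV and
    # return it as a DataFrame.
    return pd.read_csv(url, low_memory=False)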
Example #2
def save_num_cat(project_relative_root_path, local_project):

    url = project_relative_root_path + local_project + '_0_num.xlsx'
    df_num = pd.read_excel(url)

    url = project_relative_root_path + local_project + '_0_cat.xlsx'
    df_cat = pd.read_excel(url)

    delete_columns = [
        'StartDate', 'EndDate', 'Status', 'IPAddress', 'RecipientLastName',
        'RecipientFirstName', 'RecipientEmail', 'ExternalReference',
        'LocationLatitude', 'LocationLongitude', 'DistributionChannel',
        'UserLanguage', 'RecordedDate', 'ResponseId', 'Progress',
        'Duration (in seconds)', 'Finished'
    ]

    df_num = df_num.drop(columns=delete_columns)
    df_cat = df_cat.drop(columns=delete_columns)
    # Drop the first (non-data) row from both frames.
    df_num = df_num.iloc[1:]
    df_cat = df_cat.iloc[1:]

    for column in df_cat:
        df_cat.rename(columns={column: (column + '_cat')}, inplace=True)

    df_all = pd.concat([df_num, df_cat], sort=False, axis=1)

    url = project_relative_root_path + local_project + '_0_all.csv'
    print(f'url (save 0_all.csv): {url}')
    df_all.to_csv(url, index=False, encoding='utf-8')
    print(df_all.shape)

    return df_all
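The column-renaming loop above can be written in one call with DataFrame.add_suffix; a small equivalent sketch on toy data (df_cat here is illustrative, not the survey export):

import pandas as pd

df_cat = pd.DataFrame({'Q1': ['a', 'b'], 'Q2': ['c', 'd']})
# Append '_cat' to every column label instead of renaming in a loop.
df_cat = df_cat.add_suffix('_cat')
print(list(df_cat.columns))  # ['Q1_cat', 'Q2_cat']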
Example #3
    def __init__(self):
        # Read the data set into a pandas dataframe.
        df1 = pd.read_csv('static/data/DB_2009-2010.csv')
        df2 = pd.read_csv('static/data/DB_2010-2011.csv')
        # Concatenate both years so a single data set is handled.
        self.dataF = pd.concat([df1, df2])
        self.dataF = self.data()
        self.dataMonths = self.dataMonths()
        self.data_products = self.data_products()
    def test_pandas_concatenate(self):

        d1 = DataFrame(data=[2, 4, 6, 8], columns=["A"], index=[1, 2, 3, 4])
        d2 = DataFrame(data=[[1, 1.1], [3, 3.3], [5, 5.5], [7, 7.7], [9, 9.9]], columns=["A", "B"], index=[1, 2, 3, 4, 5])

        result = pandas.concat([d1, d2], keys=[1, 2])

        self.assertEqual(result["A"][1][2], 4)
        self.assertEqual(result["A"][2][2], 3)
        self.assertTrue(numpy.isnan(result["B"][1][1]))
        self.assertFloatEqual(result["B"][2][4], 7.7)
Example #5
    def historicalDataEnd(self, idx: int, start: str, end: str):
        super().historicalDataEnd(idx, start, end)

        sym = self.id2hist[idx]['symbol']
        r = self.id2hist[idx]['data'].rename(sym).fillna(0)
        # NOTE: locking a freshly created Lock() provides no mutual exclusion;
        # a lock shared across callbacks is needed here (self.lock is assumed
        # to be created once in __init__).
        with self.lock:
            self.cache = pandas.concat([self.cache, r], axis=1)

        self.logger.info('{}) {}'.format(len(self.id2hist), sym))
        self.id2hist.pop(idx, None)
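A self-contained sketch of the shared-lock pattern assumed above (class and attribute names are illustrative, not the original wrapper):

import threading
import pandas as pd

class HistoryCache:
    def __init__(self):
        self.lock = threading.Lock()   # one lock shared by every callback
        self.cache = pd.DataFrame()

    def add(self, series: pd.Series) -> None:
        # Concatenating under the shared lock keeps concurrent callbacks
        # from overwriting each other's updates to self.cache.
        with self.lock:
            self.cache = pd.concat([self.cache, series], axis=1)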
Example #6
def combine_rankings(dflow, dfunq, scoring_func=None):
    d1 = apply_rank_agg(dflow, scoring_func=scoring_func)
    d2 = apply_rank_agg(dfunq, scoring_func=scoring_func)
    d3 = pd.concat([d1, d2], axis=1)
    d3.columns = pd.MultiIndex.from_product([('Low_card', 'Unique'), list(d1)])
    sort_cols = [
        ('Unique', 'Median'),
        ('Low_card', 'Median'),
        ('Unique', 'Fails'),
    ]
    return d3.drop(('Low_card', 'Fails'), axis=1).sort_values(
        sort_cols, ascending=[True, True, True])
Example #7
    def test_pandas_concatenate(self):

        d1 = DataFrame(data=[2, 4, 6, 8], columns=["A"], index=[1, 2, 3, 4])
        d2 = DataFrame(data=[[1, 1.1], [3, 3.3], [5, 5.5], [7, 7.7], [9, 9.9]],
                       columns=["A", "B"],
                       index=[1, 2, 3, 4, 5])

        result = pandas.concat([d1, d2], keys=[1, 2])

        self.assertEqual(result["A"][1][2], 4)
        self.assertEqual(result["A"][2][2], 3)
        self.assertTrue(numpy.isnan(result["B"][1][1]))
        self.assertFloatEqual(result["B"][2][4], 7.7)
Example #8
 def process_directory(self, path="", skip_lines=0, colnames=[]):
     base_path = self.base_dir
     if path != "":
         base_path = os.path.join(base_path, path)
     files = ReadCsv.all_files(base_path)
     data = None
     all_paths = map(lambda file: os.path.join(base_path, file), files)
     for file_path in all_paths:
         temp = self.read_csv(file_path, skip_lines, colnames)
         if data is None:
             data = temp
         else:
             data = p.concat([data, temp])
     return data
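The same read-everything-then-concat-once pattern as a standalone function, with an illustrative directory path and glob pattern:

import glob
import os
import pandas as pd

def concat_directory(directory):
    # Read every CSV in the directory and stack the frames row-wise in one call.
    paths = sorted(glob.glob(os.path.join(directory, "*.csv")))
    frames = [pd.read_csv(path) for path in paths]
    return pd.concat(frames, ignore_index=True) if frames else None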
Example #9
    def get_map2(self):
        df_production_map = self.data_access.get_df_produccion()
        df_fincas_map = self.data_access.get_df_finca()

        # Mean stems per plant ('tallos_planta') for each farm ('finca').
        df_production_map_2 = df_production_map.groupby(
            'finca')['tallos_planta'].mean()
        df_production_map_2 = df_production_map_2.reset_index()
        df_production_map_2 = df_production_map_2.sort_values(by=['finca'])

        df_fincas_map2 = df_fincas_map.sort_values(by=['finca']).reset_index()

        result = pd.concat([df_production_map_2, df_fincas_map2],
                           axis=1,
                           join='inner')
        result2 = result.loc[:, ~result.columns.duplicated()]
        return result2
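A small standalone illustration of the two tricks used above: concat along axis=1 with join='inner' keeps only row labels present in both frames, and ~columns.duplicated() keeps the first of each repeated column name (toy data, not the farm dataset):

import pandas as pd

a = pd.DataFrame({'finca': ['A', 'B', 'C'], 'tallos_planta': [1.0, 2.0, 3.0]})
b = pd.DataFrame({'finca': ['A', 'B'], 'latitud': [4.6, 6.2]})

result = pd.concat([a, b], axis=1, join='inner')      # rows 0 and 1 only
result = result.loc[:, ~result.columns.duplicated()]  # drop the second 'finca'
print(result.columns.tolist())  # ['finca', 'tallos_planta', 'latitud']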
Example #10
def eater(file, table_name):
    global tables
    with open(file, "r") as f:
        j = json.loads(f.read())
        if j["op"] == "c":
            basic_dict = {"id":j["id"], "ts":j["ts"]}
            basic_dict.update(j["data"])
            if tables[table_name].empty:
                tables[table_name] = pd.DataFrame(basic_dict, columns = basic_dict.keys(), index = [basic_dict["id"]])
            else:
                t_df = pd.DataFrame(basic_dict, columns = basic_dict.keys(), index = [basic_dict["id"]])
                tables[table_name] = pd.concat([tables[table_name], t_df])
        else:
            for k, v in j["set"].items():
                if k not in tables[table_name].columns:
                    # Create the new column first, filled with None for every row.
                    tables[table_name][k] = None
                # Overwrite the value only on the row whose id matches.
                tables[table_name][k] = tables[table_name][k].where(
                    tables[table_name]['id'] != j['id'], v)
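The create-or-append logic in the "c" branch reduces to this pandas pattern; a minimal sketch with a hypothetical record:

import pandas as pd

tables = {'orders': pd.DataFrame()}
record = {'id': 7, 'ts': 1700000000, 'amount': 42}

row = pd.DataFrame(record, index=[record['id']])
if tables['orders'].empty:
    tables['orders'] = row                                 # first insert creates the table
else:
    tables['orders'] = pd.concat([tables['orders'], row])  # later inserts append a row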
Example #11
# incNums = []
crime_to_st = []
crime_dist = []
for i, crime in crime_data.iterrows():
    #print crime[1]['Category']
    # get the location of the crime (lat, long); the value is stored as text,
    # so parse it (ast.literal_eval would be a safer choice than eval)
    loc = eval(crime['Location'])
    st_of_crime = min([(streets[st].distFromStreet(loc), st) for st in streets])
    # min((CrimeStreet.dist(crime loc), CrimeStreet) for st in streets)
    streets[st_of_crime[1]].addCrime(crime['Category'])
    crime_to_st.append(st_of_crime[1])
    crime_dist.append(st_of_crime[0])
    # CS.addCrime(crime type)
    # print crime[1]['Category']
    # print crime
    # incNums.append(crime['IncidntNum'])


# starts = pd.Series([edge['startCoords'] for edge in trimmed_edges])
# ends = pd.Series([edge['endCoords'] for edge in trimmed_edges])
# dists = pd.Series([edge['distance'] for edge in trimmed_edges])
keys = ['Category', 'DayOfWeek', 'Date', 'Time', 'Location', 'StreetMatch', 'Distance']
crime_df = pd.concat(
    [pd.Series(cats), pd.Series(days), pd.Series(dates), pd.Series(times),
     pd.Series(locs), pd.Series(crime_to_st), pd.Series(crime_dist)],
    axis=1, keys=keys)
# crime_df.to_csv("crimes_with_streets.csv")

print('finished matching crimes to streets')



Example #12
stEdges = pd.read_csv("cal.cedge.csv")
stNodes = pd.read_csv("cal.cnode.csv")

print "COLUMNS FOR EDGES: ", stEdges.columns
print "COLUMNS FOR NODES: ", stNodes.columns

startCoords = []
endCoords = []

nodes = stNodes.to_numpy()  # as_matrix() was removed in pandas 1.0

for i, edge in stEdges.iterrows():
    # print(edge)
    # print(edge['startID'], edge['endID'])
    start = int(edge['startID'])
    end = int(edge['endID'])

    startCoords.append((float(nodes[start][2]), float(nodes[start][1])))
    endCoords.append((float(nodes[end][2]), float(nodes[end][1])))
    # print(edge['NodeID'])
    # which st['NodeID'] == startID

startCoords = pd.Series(startCoords, name='startCoords')
endCoords = pd.Series(endCoords, name='endCoords')
#print startCoords

df = pd.concat([stEdges['EdgeID'], startCoords, endCoords, stEdges['distance']], axis=1)
# print df

df.to_csv("edgeLocs.csv")
Example #13
from narratives.coded.parse import parse_atlas_output
from narratives.coded.convert import convert_doc2docx
from narratives.coded.transform import transform
from narratives.coded.process import process
from glob import glob
import pandas
import numpy
from numpy.random import choice
from collections import Counter
import feather

frames = []
for doc in glob('./data/*.doc'):
    convert_doc2docx(doc, './data/')
    frames.append(parse_atlas_output(doc + "x"))

data = pandas.concat(frames)
transformed_data = transform(data, 'code')
transformed_data['dataset'] = choice(['TRAIN','TEST'], len(transformed_data['segment']), p=(0.7, 0.3))
print(Counter(transformed_data['dataset']))

processed_data = process(transformed_data, 'segment')
print(processed_data)

feather.write_dataframe(processed_data, './data/coded_data.feather')
Example #14
    def neural_network(self):
        datos = self.data_access.get_df_produccion()
        datos.drop(['Bloque','Nave','Lado','Cama','Id Cama','Piloto/homogenea','Area','Suma de Indice tallos/M2','Suma de Indice tallos/planta','Notas'],axis=1,inplace=True)
        datos.rename({'Fecha Siembra':'fecha_siembra','Año Semana':'ano_semana','UP':'finca','Tipo':'tipo','Variedad':'variedad','Fecha siembra':'fecha_siembra','Concatenado':'concatenado','Tallos producidos':'tallos','Edad':'edad','Cantidad':'cantidad_plantas','Fiesta':'fiesta'},axis=1,inplace=True)
        datos['coeficiente']=datos['tallos']/datos['cantidad_plantas']
        datos['ano_semana']=datos['ano_semana'].astype(str)

        colores = self.data_access.get_df_variedad_color()
        colores.rename(columns={'Variedad':'variedad'},inplace=True)

        fechas= self.data_access.get_df_fechas()
        fechas['dia']=pd.to_datetime(fechas['dia'])
        semanas=fechas.groupby('ano_semana').max().reset_index()
        semanas['ano_semana']=semanas['ano_semana'].astype(str)

        estaciones = self.get_df_estacion()

        fincas=self.get_df_finca()
        fincas.rename(columns={'FINCAS':'nombre','SIGLA':'sigla','LATITUD':'latitud','LONGITUD':'longitud'},inplace=True)
        finca_estac = pd.DataFrame()

        for i in list(fincas.sigla.unique()):
            temp= fincas[fincas['sigla']==i]
            temp=pd.concat([temp,estaciones],ignore_index=True)
            temp['nombre']=temp.iloc[0,0]
            temp['sigla']=temp.iloc[0,1]
            temp['latitud']=temp.iloc[0,2]
            temp['longitud']=temp.iloc[0,3]
            temp.dropna(inplace=True)
            temp['distancia'] = np.sqrt((temp['LATITUD'] - temp['latitud'])**2 + (temp['LONGITUD'] - temp['longitud'])**2)
            temp.sort_values(['sigla','distancia'],ignore_index=True,inplace=True)
            temp=temp.head(1)
            finca_estac=pd.concat([finca_estac,temp])


        finca_estac=finca_estac.reset_index(drop=True)

        # Calculate average points for every farm-variety-age-week
        promedio = pd.DataFrame()
        datos.ano_semana = datos.ano_semana.astype(int)
        for i in list(datos['ano_semana'].sort_values().unique()):
            desde = i - 199
            datos_filt = datos[(datos['ano_semana'] >= desde) & (datos['ano_semana'] < i - 4)].copy()

            curva_promed = datos_filt.groupby(['finca', 'variedad', 'edad'])['coeficiente'].mean().reset_index()
            curva_promed['ano_semana'] = i
            curva_promed['ano_semana'] = curva_promed['ano_semana'].astype(str)
            promedio = pd.concat([promedio, curva_promed], ignore_index=True)
        # Build the lookup dict once, after the loop, so each iteration keeps
        # concatenating onto a DataFrame rather than a dict.
        promedio = promedio.set_index(['ano_semana', 'finca', 'variedad', 'edad']).to_dict('index')

        def curva_promedio(ansem, up, variedad, edad):
            try:
                return promedio[ansem, up, variedad, edad]['coeficiente']
            except KeyError:
                return 0
        datos.ano_semana = datos.ano_semana.astype(str)

        datos=datos.merge(semanas,on='ano_semana',how='left')
        datos['mes_dato']=datos.dia.dt.month     

        recons=datos.sort_values(['concatenado','edad']).reset_index(drop=True)

        lag_prod=10

        for i in tqdm(range(1,lag_prod+1)):
            strprod=str(i)+'sem_atras'
            strconc=str(i)+'concat_atras'
            recons[strprod]=recons['coeficiente'].shift(i)
            recons[strconc]=recons['concatenado'].shift(i)
            vald=str(i)+'valido'
            recons[vald]=recons.apply(lambda ff: 1 if ff[strconc]==ff['concatenado'] else 0,axis=1)
            recons[strprod]=recons.apply(lambda x: x[strprod] if x[vald]==1 else 0,axis=1)
            recons.drop(columns={strconc},inplace=True)
            recons.drop(columns={vald},inplace=True)

        recons.drop(columns={'Unnamed: 0'},inplace=True)
        recons=recons.merge(colores[['variedad','Color']],how='left',on='variedad')

        # Add the standard-curve column for each variety-farm combination
        recons.ano_semana=(recons.ano_semana).astype(str)
        recons['curva_metodo_finca'] = recons.apply(lambda x: curva_promedio(x['ano_semana'],x['finca'],x['variedad'],x['edad']),axis=1)
        recons=recons[recons['tipo'].isin(['Minicarnation','Carnation'])]
        recons.Color.fillna('NoColor',inplace=True)
        recons['edad^2']=recons['edad']**2
        recons['edad^3']=recons['edad']**3


        # Neural network
        consolidado_rn=pd.DataFrame()
        from sklearn.preprocessing import StandardScaler
        from tensorflow import keras
        from tensorflow.keras import layers
        recons=recons[recons['dia']>='01/01/2018']
        y_hat_rn=pd.Series(name='y_hat_falso')

        for i in recons.tipo.unique():
            for j in recons_test[recons_test['tipo']==i]['Color'].unique():
                temp_test=recons_test[(recons_test['tipo']==i)&(recons_test['Color']==j)]
                df_clean_test=pd.concat([temp_test[['edad','edad^2','edad^3','mes_dato','5sem_atras',
                                '6sem_atras','7sem_atras','8sem_atras','9sem_atras','10sem_atras',
                                 #'11sem_atras','12sem_atras','13sem_atras','14sem_atras','15sem_atras',
                                  'curva_metodo_finca','coeficiente']], pd.get_dummies(temp_test['variedad']), pd.get_dummies(temp_test['finca'])], axis=1)
                df_clean_test.fillna(value=0,inplace=True)
                y_real_test = df_clean_test.coeficiente
                X_real_test = df_clean_test.drop('coeficiente', axis=1)

                temp=recons[(recons['tipo']==i)&(recons['Color']==j)]
                temp=temp[temp['variedad'].isin(temp_test['variedad'].unique())]
                temp=temp[temp['finca'].isin(temp_test['finca'].unique())]
                df_clean=pd.concat([temp[['edad','edad^2','edad^3','mes_dato','5sem_atras',
                                 '6sem_atras','7sem_atras','8sem_atras','9sem_atras','10sem_atras',
                                  #'11sem_atras','12sem_atras','13sem_atras','14sem_atras','15sem_atras',
                                  'curva_metodo_finca','coeficiente']], pd.get_dummies(temp['variedad']), pd.get_dummies(temp['finca'])], axis=1)

                df_clean.fillna(value=0,inplace=True)
                y = df_clean.coeficiente
                X = df_clean.drop('coeficiente', axis=1)
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
                scaler = StandardScaler()
                X_train_std = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
                neurons = 256
                model = keras.Sequential([layers.Dense(neurons, activation='relu', input_shape=[len(X_train_std.columns)]),
                                  layers.Dense(neurons,activation='relu'),
                                  layers.Dense(1,activation='relu')])   # Output layer
                model.compile(loss='mse', optimizer = 'adam')
                history = model.fit(X_train_std, y_train, epochs=100, validation_split = 0.2, verbose=0,batch_size=100)

                X_norm = scaler.transform(X_real_test)
                indice=X_real_test.reset_index()['index']
                y_hat=model.predict(X_norm)
                y_hat=pd.Series(y_hat[0:,0],name='y_hat')
                y_hat.index=X_real_test.index
                y_hat_rn=pd.concat([y_hat_rn,y_hat],axis=1)

        y_hat_rn.drop(columns={'y_hat_falso'},inplace=True)
        ser_y_hat=np.sum(y_hat_rn,axis=1)
        y_hat_rn['y_hat_red_n']=ser_y_hat
        validacion_y_hat=y_hat_rn[['y_hat_red_n']]
        validacion_final=pd.concat([recons_test,validacion_y_hat],axis=1)
                          
Example #15
def replace_and_concat(column_name, df):
    # One-hot encode the column, then replace it with the dummy columns.
    replacement = pandas.get_dummies(df[column_name], prefix=column_name)
    replacement = replacement.set_index(df.index)
    df.drop(column_name, axis=1, inplace=True)
    df = pandas.concat([df, replacement], axis=1)
    return df
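A small usage sketch on toy data (the column and its values are illustrative):

import pandas

df = pandas.DataFrame({'color': ['red', 'blue', 'red'], 'size': [1, 2, 3]})
df = replace_and_concat('color', df)
# 'color' is replaced by the indicator columns produced by get_dummies.
print(df.columns.tolist())  # ['size', 'color_blue', 'color_red']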
Example #17
    errors='ignore')

# clean and transform agent records
for row in jdata:

    row['agentActiveYr'] = row['reportYear']
    row['agentTitle'] = 'REGISTERED AGENT'

    # remove unwanted officers
    row.pop('officersList', None)

# export agent records from in-memory python objects to dataframes
df_agents = json_normalize(jdata, None, errors='ignore')

# combine agents and offices into a single dataframe set
df = pd.concat([df_agents, df_officers])

# trim all fields on all rows
df = trimAllColumns(df)
df = removePeriodsFromAllColumns(df)

# remove duplicates
groupby_list = list(set(df.columns) - set(['agentTitle']))
df = df.groupby(groupby_list).agg({
    'agentTitle': combineRows,
})

# remove multi-level index prior to JSON export
df = df.reset_index()
df = df.sort_index(axis=1)
df = df.sort_values(['taxpayerId'])
Example #18
    def run(self,
            surface_only=True,
            improvements_only=True,
            progress=True,
            view=None):
        """Run the differential flux variability analysis.

        Parameters
        ----------
        surface_only : bool, optional
            If only the surface of the n-dimensional production envelope should be scanned (defaults to True).
        improvements_only : bool, optional
            If only grid points should be scanned that constitute an improvement in production
            over the reference state (defaults to True).
        progress : bool, optional
            If a progress bar should be shown.
        view : SequentialView or MultiprocessingView or ipython.cluster.DirectView, optional
            A parallelization view (defaults to SequentialView).

        Returns
        -------
        pandas.Panel
            A pandas Panel containing a results DataFrame for every grid point scanned.
        """
        with TimeMachine() as tm:
            # Make sure that the design_space_model is initialized to its original state later
            for variable in self.variables:
                reaction = self.design_space_model.reactions.get_by_id(
                    variable)
                tm(do=int,
                   undo=partial(setattr, reaction, 'lower_bound',
                                reaction.lower_bound))
                tm(do=int,
                   undo=partial(setattr, reaction, 'upper_bound',
                                reaction.upper_bound))
            target_reaction = self.design_space_model.reactions.get_by_id(
                self.objective)
            tm(do=int,
               undo=partial(setattr, target_reaction, 'lower_bound',
                            target_reaction.lower_bound))
            tm(do=int,
               undo=partial(setattr, target_reaction, 'upper_bound',
                            target_reaction.upper_bound))

            if view is None:
                view = config.default_view
            else:
                view = view

            included_reactions = [
                reaction.id for reaction in self.reference_model.reactions
                if reaction.id not in self.exclude
            ] + self.variables + [self.objective]

            self.reference_flux_dist = pfba(self.reference_model,
                                            fraction_of_optimum=0.99)

            self.reference_flux_ranges = flux_variability_analysis(
                self.reference_model,
                reactions=included_reactions,
                view=view,
                remove_cycles=False,
                fraction_of_optimum=0.75).data_frame

            self._init_search_grid(surface_only=surface_only,
                                   improvements_only=improvements_only)

            func_obj = _DifferentialFvaEvaluator(self.design_space_model,
                                                 self.variables,
                                                 self.objective,
                                                 included_reactions)
            if progress:
                progress = ProgressBar(len(self.grid))
                results = list(
                    progress(view.imap(func_obj, self.grid.iterrows())))
            else:
                results = list(view.map(func_obj, self.grid.iterrows()))

        solutions = dict((tuple(point.iteritems()), fva_result)
                         for (point, fva_result) in results)
        reference_intervals = self.reference_flux_ranges[[
            'lower_bound', 'upper_bound'
        ]].values
        for sol in six.itervalues(solutions):
            intervals = sol[['lower_bound', 'upper_bound']].values
            gaps = [
                self._interval_gap(interval1, interval2) for interval1,
                interval2 in my_zip(reference_intervals, intervals)
            ]
            sol['gaps'] = gaps
            if self.normalize_ranges_by is not None:
                normalizer = sol.lower_bound[self.normalize_ranges_by]
                if normalizer > non_zero_flux_threshold:
                    normalized_intervals = sol[['lower_bound', 'upper_bound'
                                                ]].values / normalizer

                    sol['normalized_gaps'] = [
                        self._interval_gap(interval1, interval2)
                        for interval1, interval2 in my_zip(
                            reference_intervals, normalized_intervals)
                    ]
                else:
                    sol['normalized_gaps'] = [numpy.nan] * len(sol.lower_bound)
            else:
                sol['normalized_gaps'] = gaps

        ref_upper_bound = self.reference_flux_ranges.upper_bound.apply(
            lambda v: 0 if abs(v) < non_zero_flux_threshold else v)
        ref_lower_bound = self.reference_flux_ranges.lower_bound.apply(
            lambda v: 0 if abs(v) < non_zero_flux_threshold else v)

        collection = list()
        for key, df in six.iteritems(solutions):
            df['biomass'] = key[0][1]
            df['production'] = key[1][1]

            df['KO'] = False
            df['flux_reversal'] = False
            df['suddenly_essential'] = False
            df['free_flux'] = False

            df.loc[(df.lower_bound == 0) & (df.upper_bound == 0) &
                   (ref_upper_bound != 0) & (ref_lower_bound != 0),
                   'KO'] = True

            df.loc[((ref_upper_bound < 0) & (df.lower_bound > 0) |
                    ((ref_lower_bound > 0) & (df.upper_bound < 0))),
                   'flux_reversal'] = True

            df.loc[((df.lower_bound <= 0) & (df.lower_bound > 0)) |
                   ((ref_lower_bound >= 0) & (df.upper_bound <= 0)),
                   'suddenly_essential'] = True

            is_reversible = numpy.asarray([
                self.design_space_model.reactions.get_by_id(i).reversibility
                for i in df.index
            ],
                                          dtype=bool)
            not_reversible = numpy.logical_not(is_reversible)

            df.loc[((df.lower_bound == -1000) &
                    (df.upper_bound == 1000) & is_reversible) |
                   ((df.lower_bound == 0) &
                    (df.upper_bound == 1000) & not_reversible) |
                   ((df.lower_bound == -1000) &
                    (df.upper_bound == 0) & not_reversible),
                   'free_flux'] = True

            df['reaction'] = df.index
            df['excluded'] = df['reaction'].isin(self.exclude)

            collection.append(df)


#        multi_index = [(key[0][1], key[1][1]) for key in solutions]
#        solutions_multi_index = pandas.concat(list(solutions.values()),
# axis=0, keys=multi_index)#
#        solutions_multi_index.index.set_names(['biomass', 'production',
# 'reaction'], inplace=True)
        total = pandas.concat(collection, ignore_index=True, copy=False)
        total.sort_values(['biomass', 'production', 'reaction'], inplace=True)
        total.index = total['reaction']
        return DifferentialFVAResult(total, self.envelope,
                                     self.reference_flux_ranges,
                                     self.reference_flux_dist)
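The commented-out alternative above relies on concat's keys= argument to build a hierarchical index; a small standalone sketch of that pattern with made-up flux data:

import pandas

frames = [pandas.DataFrame({'flux': [1.0, 2.0]}, index=['R1', 'R2']),
          pandas.DataFrame({'flux': [3.0, 4.0]}, index=['R1', 'R2'])]
multi_index = [(0.1, 5.0), (0.2, 4.5)]  # e.g. (biomass, production) per grid point

stacked = pandas.concat(frames, axis=0, keys=multi_index)
stacked.index.set_names(['biomass', 'production', 'reaction'], inplace=True)
print(stacked.index.names)  # FrozenList(['biomass', 'production', 'reaction'])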
Example #19
        fail_count[2] += 1
        continue
    if end[0] > 38.5 or end[0] < 37.5: 
        fail_count[3] += 1
        continue
    trimmed_edges.append(e)
print(fail_count)
print(len(trimmed_edges))
# print trimmed_edges

edgeIDs = pd.Series([edge['EdgeID'] for edge in trimmed_edges])
starts = pd.Series([edge['startCoords'] for edge in trimmed_edges])
ends = pd.Series([edge['endCoords'] for edge in trimmed_edges])
dists = pd.Series([edge['distance'] for edge in trimmed_edges])

trimmed_df = pd.concat([edgeIDs, starts, ends, dists], axis=1, keys=['EdgeID', 'startCoords', 'endCoords', 'distance'])
# trimmed_df = pd.concat(trimmed_edges, axis=0, keys = [edge['EdgeID'] for edge in trimmed_edges])
#print trimmed_df

trimmed_df.to_csv("trimmed_edges.csv")

# cats = df['Category']
# print type(cats).__name__
# print len(cats)

# for i, crime in df.iterrows():
	# find the minimum (distFromStreet, CrimeStreet) pair
	# add the crime to that street


    def run(self, surface_only=True, improvements_only=True, progress=True, view=None):
        """Run the differential flux variability analysis.

        Parameters
        ----------
        surface_only : bool, optional
            If only the surface of the n-dimensional production envelope should be scanned (defaults to True).
        improvements_only : bool, optional
            If only grid points should should be scanned that constitute and improvement in production
            over the reference state (defaults to True).
        progress : bool, optional
            If a progress bar should be shown.
        view : SequentialView or MultiprocessingView or ipython.cluster.DirectView, optional
            A parallelization view (defaults to SequentialView).

        Returns
        -------
        pandas.Panel
            A pandas Panel containing a results DataFrame for every grid point scanned.
        """
        with TimeMachine() as tm:
            # Make sure that the design_space_model is initialized to its original state later
            for variable in self.variables:
                reaction = self.design_space_model.reactions.get_by_id(variable)
                tm(do=int, undo=partial(setattr, reaction, 'lower_bound', reaction.lower_bound))
                tm(do=int, undo=partial(setattr, reaction, 'upper_bound', reaction.upper_bound))
            target_reaction = self.design_space_model.reactions.get_by_id(self.objective)
            tm(do=int, undo=partial(setattr, target_reaction, 'lower_bound', target_reaction.lower_bound))
            tm(do=int, undo=partial(setattr, target_reaction, 'upper_bound', target_reaction.upper_bound))

            if view is None:
                view = config.default_view
            else:
                view = view

            included_reactions = [reaction.id for reaction in self.reference_model.reactions if
                                  reaction.id not in self.exclude] + self.variables + [self.objective]

            self.reference_flux_dist = pfba(self.reference_model, fraction_of_optimum=0.99)

            self.reference_flux_ranges = flux_variability_analysis(self.reference_model, reactions=included_reactions,
                                                                   view=view, remove_cycles=False,
                                                                   fraction_of_optimum=0.75).data_frame

            self._init_search_grid(surface_only=surface_only, improvements_only=improvements_only)

            func_obj = _DifferentialFvaEvaluator(self.design_space_model, self.variables, self.objective,
                                                 included_reactions)
            if progress:
                progress = ProgressBar(len(self.grid))
                results = list(progress(view.imap(func_obj, self.grid.iterrows())))
            else:
                results = list(view.map(func_obj, self.grid.iterrows()))

        solutions = dict((tuple(point.iteritems()), fva_result) for (point, fva_result) in results)
        reference_intervals = self.reference_flux_ranges[['lower_bound', 'upper_bound']].values
        for sol in six.itervalues(solutions):
            intervals = sol[['lower_bound', 'upper_bound']].values
            gaps = [self._interval_gap(interval1, interval2) for interval1, interval2 in
                    my_zip(reference_intervals, intervals)]
            sol['gaps'] = gaps
            if self.normalize_ranges_by is not None:
                normalizer = sol.lower_bound[self.normalize_ranges_by]
                if normalizer > non_zero_flux_threshold:
                    normalized_intervals = sol[['lower_bound', 'upper_bound']].values / normalizer

                    sol['normalized_gaps'] = [self._interval_gap(interval1, interval2) for interval1, interval2 in
                                              my_zip(reference_intervals, normalized_intervals)]
                else:
                    sol['normalized_gaps'] = [numpy.nan] * len(sol.lower_bound)
            else:
                sol['normalized_gaps'] = gaps

        ref_upper_bound = self.reference_flux_ranges.upper_bound.apply(
            lambda v: 0 if abs(v) < non_zero_flux_threshold else v)
        ref_lower_bound = self.reference_flux_ranges.lower_bound.apply(
            lambda v: 0 if abs(v) < non_zero_flux_threshold else v)

        collection = list()
        for key, df in six.iteritems(solutions):
            df['biomass'] = key[0][1]
            df['production'] = key[1][1]

            df['KO'] = False
            df['flux_reversal'] = False
            df['suddenly_essential'] = False
            df['free_flux'] = False

            df.loc[
                (df.lower_bound == 0) & (
                    df.upper_bound == 0) & (
                        ref_upper_bound != 0) & (
                            ref_lower_bound != 0),
                'KO'
            ] = True

            df.loc[
                ((ref_upper_bound < 0) & (df.lower_bound > 0) | (
                    (ref_lower_bound > 0) & (df.upper_bound < 0))),
                'flux_reversal'
            ] = True

            df.loc[
                ((df.lower_bound <= 0) & (df.lower_bound > 0)) | (
                    (ref_lower_bound >= 0) & (df.upper_bound <= 0)),
                'suddenly_essential'
            ] = True

            is_reversible = numpy.asarray([
                self.design_space_model.reactions.get_by_id(i).reversibility
                for i in df.index], dtype=bool)
            not_reversible = numpy.logical_not(is_reversible)

            df.loc[
                ((df.lower_bound == -1000) & (df.upper_bound == 1000) & is_reversible) | (
                    (df.lower_bound == 0) & (df.upper_bound == 1000) & not_reversible) | (
                        (df.lower_bound == -1000) & (df.upper_bound == 0) & not_reversible),
                'free_flux'
            ] = True

            df['reaction'] = df.index
            df['excluded'] = df['reaction'].isin(self.exclude)

            collection.append(df)

#        multi_index = [(key[0][1], key[1][1]) for key in solutions]
#        solutions_multi_index = pandas.concat(list(solutions.values()),
        # axis=0, keys=multi_index)#
#        solutions_multi_index.index.set_names(['biomass', 'production',
        # 'reaction'], inplace=True)
        total = pandas.concat(collection, ignore_index=True, copy=False)
        total.sort_values(['biomass', 'production', 'reaction'], inplace=True)
        total.index = total['reaction']
        return DifferentialFVAResult(total, self.envelope, self.reference_flux_ranges, self.reference_flux_dist)
Example #21
    def run(self,
            surface_only=True,
            improvements_only=True,
            progress=True,
            view=None,
            fraction_of_optimum=1.0):
        """Run the differential flux variability analysis.

        Parameters
        ----------
        surface_only : bool, optional
            If only the surface of the n-dimensional production envelope should be scanned (defaults to True).
        improvements_only : bool, optional
            If only grid points should should be scanned that constitute and improvement in production
            over the reference state (defaults to True).
        progress : bool, optional
            If a progress bar should be shown.
        view : SequentialView or MultiprocessingView or ipython.cluster.DirectView, optional
            A parallelization view (defaults to SequentialView).
        fraction_of_optimum : float, optional
            A value between zero and one that determines the width of the
            flux ranges of the reference solution. The lower the value,
            the larger the ranges.

        Returns
        -------
        pandas.Panel
            A pandas Panel containing a results DataFrame for every grid point scanned.
        """
        # Calculate the reference state.
        self.reference_flux_dist = pfba(
            self.reference_model, fraction_of_optimum=fraction_of_optimum)

        self.reference_flux_ranges = flux_variability_analysis(
            self.reference_model,
            reactions=self.included_reactions,
            view=view,
            remove_cycles=False,
            fraction_of_optimum=fraction_of_optimum).data_frame
        self.reference_flux_ranges[
            self.reference_flux_ranges.abs() < non_zero_flux_threshold] = 0.0
        reference_intervals = self.reference_flux_ranges.loc[
            self.included_reactions, ['lower_bound', 'upper_bound']].values

        if self.normalize_ranges_by is not None:
            logger.debug(
                self.reference_flux_ranges.loc[self.normalize_ranges_by, ])
            # The most obvious flux to normalize by is the biomass reaction
            # flux. This is probably always greater than zero. Just in case
            # the model is defined differently or some other normalizing
            # reaction is chosen, we use the absolute value.
            norm = abs(self.reference_flux_ranges.at[self.normalize_ranges_by,
                                                     "lower_bound"])
            if norm > non_zero_flux_threshold:
                normalized_reference_intervals = reference_intervals / norm
            else:
                raise ValueError(
                    "The reaction that you have chosen for normalization '{}' "
                    "has zero flux in the reference state. Please choose another "
                    "one.".format(self.normalize_ranges_by))

        with TimeMachine() as tm:
            # Make sure that the design_space_model is initialized to its original state later
            for variable in self.variables:
                reaction = self.design_space_model.reactions.get_by_id(
                    variable)
                tm(do=int,
                   undo=partial(setattr, reaction, 'lower_bound',
                                reaction.lower_bound))
                tm(do=int,
                   undo=partial(setattr, reaction, 'upper_bound',
                                reaction.upper_bound))
            target_reaction = self.design_space_model.reactions.get_by_id(
                self.objective)
            tm(do=int,
               undo=partial(setattr, target_reaction, 'lower_bound',
                            target_reaction.lower_bound))
            tm(do=int,
               undo=partial(setattr, target_reaction, 'upper_bound',
                            target_reaction.upper_bound))

            if view is None:
                view = config.default_view
            else:
                view = view

            self._init_search_grid(surface_only=surface_only,
                                   improvements_only=improvements_only)

            func_obj = _DifferentialFvaEvaluator(self.design_space_model,
                                                 self.variables,
                                                 self.objective,
                                                 self.included_reactions)
            if progress:
                progress = ProgressBar(len(self.grid))
                results = list(
                    progress(view.imap(func_obj, self.grid.iterrows())))
            else:
                results = list(view.map(func_obj, self.grid.iterrows()))

        solutions = dict((tuple(point.iteritems()), fva_result)
                         for (point, fva_result) in results)

        for sol in solutions.values():
            sol[sol.abs() < non_zero_flux_threshold] = 0.0
            intervals = sol.loc[self.included_reactions,
                                ['lower_bound', 'upper_bound']].values
            gaps = [
                self._interval_gap(interval1, interval2)
                for interval1, interval2 in zip(reference_intervals, intervals)
            ]
            sol['gaps'] = gaps
            if self.normalize_ranges_by is not None:
                # See comment above regarding normalization.
                normalizer = abs(sol.lower_bound[self.normalize_ranges_by])
                if normalizer > non_zero_flux_threshold:
                    normalized_intervals = sol.loc[
                        self.included_reactions,
                        ['lower_bound', 'upper_bound']].values / normalizer

                    sol['normalized_gaps'] = [
                        self._interval_gap(interval1, interval2)
                        for interval1, interval2 in zip(
                            normalized_reference_intervals,
                            normalized_intervals)
                    ]
                else:
                    sol['normalized_gaps'] = numpy.nan
            else:
                sol['normalized_gaps'] = gaps

        # Determine where the reference flux range overlaps with zero.
        zero_overlap_mask = numpy.asarray([
            self._interval_overlap(interval1, (0, 0)) > 0
            for interval1 in reference_intervals
        ],
                                          dtype=bool)
        collection = list()
        for key, df in solutions.items():
            df['biomass'] = key[0][1]
            df['production'] = key[1][1]

            df['KO'] = False
            df['flux_reversal'] = False
            df['suddenly_essential'] = False
            df['free_flux'] = False

            df.loc[(df.lower_bound == 0) & (df.upper_bound == 0) &
                   (~zero_overlap_mask), 'KO'] = True

            df.loc[((self.reference_flux_ranges.upper_bound < 0) &
                    (df.lower_bound > 0) |
                    ((self.reference_flux_ranges.lower_bound > 0) &
                     (df.upper_bound < 0))), 'flux_reversal'] = True

            df.loc[(zero_overlap_mask & (df.lower_bound > 0)) |
                   (zero_overlap_mask & (df.upper_bound < 0)),
                   'suddenly_essential'] = True

            is_reversible = numpy.asarray([
                self.design_space_model.reactions.get_by_id(i).reversibility
                for i in df.index
            ],
                                          dtype=bool)
            not_reversible = ~is_reversible

            df.loc[((df.lower_bound == -1000) &
                    (df.upper_bound == 1000) & is_reversible) |
                   ((df.lower_bound == 0) &
                    (df.upper_bound == 1000) & not_reversible) |
                   ((df.lower_bound == -1000) &
                    (df.upper_bound == 0) & not_reversible),
                   'free_flux'] = True

            df['reaction'] = df.index
            df['excluded'] = df['reaction'].isin(self.exclude)

            collection.append(df)


#        multi_index = [(key[0][1], key[1][1]) for key in solutions]
#        solutions_multi_index = pandas.concat(list(solutions.values()),
# axis=0, keys=multi_index)#
#        solutions_multi_index.index.set_names(['biomass', 'production',
# 'reaction'], inplace=True)
        total = pandas.concat(collection, ignore_index=True, copy=False)
        total.sort_values(['biomass', 'production', 'reaction'], inplace=True)
        total.index = total['reaction']
        return DifferentialFVAResult(total, self.envelope,
                                     self.reference_flux_ranges)
Example #22
    def build_fingerprint_matrices(self):
        # pathnames: List of paths to each piece for which a fingerprint matrix should be built
        # number_of_fingerprints: however many fingerprints you need
        interval_settings = self.interval_settings

        fingerprint_matrices = {}
        
        # Load pickled fingerprints
        if self.fp_pickle_path is not None:
            if os.path.isfile(self.fp_pickle_path):
                print "Found pickled fingerprints at '" + self.fp_pickle_path +"', importing..."
                with open(self.fp_pickle_path, 'rb') as fp_pickle:
                    fingerprint_matrices = pickle.load(fp_pickle)
            else:
                print "Warning: was asked to look for pickled fingerprints at '" + self.fp_pickle_path +"'"
                print "Couldn't find any -- new pickle file will be created."

        number_of_fingerprints = self.number_of_fingerprints

        for path in self.pathnames:
            # Skip pickled fingerprints
            if os.path.basename(path) in fingerprint_matrices.keys():
                continue
            # Setup for each piece
            #print("Indexing " + path)
            piece = IndexedPiece(path)
            piece_stream = music21.converter.parseFile(path)

            # LM: Get time signature and determine strong beats
            time_sigs = piece.get_data([metre.TimeSignatureIndexer])

            # Assuming no time signature change in whole piece, assign offsets to strong beats
            if time_sigs['metre.TimeSignatureIndexer']['0'].iloc[0] in ('6/8', '9/8'):
                strong_beat_offsets = 1.5
                measures = 4
            else:
                strong_beat_offsets = 1.0
                measures = 4
            # LM: Get total number of offsets
            numer, denom = time_sigs['metre.TimeSignatureIndexer']['0'].iloc[0].split('/')
            # Four bars worth of offsets, ignoring anacrusis...
            # Add an extra strong beat at end 
            total_offsets = int(numer) * measures*4.0/int(denom) + strong_beat_offsets

            interval_settings['quarterLength'] = strong_beat_offsets
            interval_settings['intervalDistance'] = strong_beat_offsets
            interval_settings['subsection'] = (0.0, total_offsets)

            # LM: Build strong-interval frame
            strong_intervals = self.__build_strong_intervals(piece, interval_settings, strong_beat_offsets, total_offsets)

            # LM: Build weak-interval frame
            weak_intervals = self.__build_weak_intervals(piece, interval_settings, strong_beat_offsets, total_offsets)

            # LM: Assemble results
            # 1. Prepare strong_intervals -- had to change this due to change in representation... take off final column (start of new bar)
            strong_intervals = strong_intervals.T.iloc[:-1].T
            strong_intervals = self.__shift_matrix(strong_intervals)
            # Had to change this due to change in representation.... take off final row
            # strong_intervals = strong_intervals.iloc[:]
            
            # 2. Prepare weak_intervals:
            weak_intervals = weak_intervals.iloc[:]
            weak_intervals.index = my_range(strong_beat_offsets, strong_beat_offsets, total_offsets+strong_beat_offsets)

            # 3. Row of 0s --- added after discussion with Laura pertaining to fingerprint representation
            zeros = DataFrame(Series([0.0]*(len(weak_intervals))))
            zeros.index = (my_range(strong_beat_offsets, strong_beat_offsets, total_offsets+strong_beat_offsets))
            zeros = zeros.T

            # 4. Append 
            fingerprint_frame = pandas.concat([weak_intervals.T, zeros, strong_intervals])
            fingerprint_frame.index = (['w'] + fingerprint_frame.index.tolist()[1:])

            #piece_stream.show('musicxml', 'MuseScore')   
            #  DataFrame(Series([0.0]*(len(weak_intervals)+1))).reindex(range(1, len(weak_intervals)+1)).T
            fingerprint_matrices[os.path.basename(path)]=fingerprint_frame
                
            number_of_fingerprints -= 1
            if 0 == number_of_fingerprints:
                print "Max Number of Fingerprints Reached"
                break

        return fingerprint_matrices