def load_master_initial_merge():
    """
    GitHub Issue #3 should be fixed:
    Initial merge is Master SCORP .csv  + GEO .xlsx file.
    Logic should pull street type and
    address fields from Geo and overwrite master.

    """
    print("Loading SCORP Master...")
    scorp_master_file = r"State Comprehensive Outdoor Recreation Plan Inventory of Facilities\MasterSCORP_Base.xlsx"
    print("Loading SCORP GEO...")
    geo_master_file = r"State Comprehensive Outdoor Recreation Plan Inventory of Facilities\SCORP_FILTER_GEO.xlsx"
    # Set index_col so that the rows match up on OBJECTID.
    sm = pd.read_excel(scorp_master_file, index_col="OBJECTID")
    gm = pd.read_excel(geo_master_file, index_col="OBJECTID")
    # Take Street type, street, town from geo, where available.
    sm["Street_Type"] = gm["Street_Type"]
    sm["Street"] = gm["Street"]
    sm['Town'] = gm['Town']
    print("Updated Street_Type, Street, Town from GEO to Master.")
    # Export MasterSCORP_Updated.csv to be new master,
    # then return the dataframe to whoever called it.
    export_filename = r"State Comprehensive Outdoor Recreation Plan Inventory of Facilities\MasterSCORP_Updated.csv"
    print("Exporting to {}".format(export_filename))
    sm.to_csv(export_filename)
    return sm
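# A minimal alternative sketch (not from the original project): only overwrite the
# Master fields where GEO actually has a value, instead of also copying NaNs.
# Assumes both frames are indexed on OBJECTID, as in load_master_initial_merge().
def overwrite_where_available(sm, gm, cols=("Street_Type", "Street", "Town")):
    for col in cols:
        # combine_first keeps the GEO value where present and falls back to Master
        sm[col] = gm[col].combine_first(sm[col])
    return sm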
def magicitems(dict, roll, row):
  if dict[roll]['MI Numb'] != '0':
    if dict[roll]['MI Numb 2'] != '0':
      times2 = diceroller(dict[roll]['MI Numb 2'])
      y = 0
      items2dict = {}
      try:
        mdf2 = pd.read_excel('Items.xlsx', sheet_name = dict[roll]['Item 2'], index_col = 0, usecols = 'E:F')
        mitems2 = dictcreator(mdf2)
        while y < times2:
          rolls = random.randint(1,100)
          items2dict['var_' + str(y)] = tk.Label(root, text = mitems2[rolls]['Item']).grid(row = row + 1, column = 1, columnspan = 2)
          y += 1
          row += 1
      except SyntaxError:
        pass
    times = diceroller(dict[roll]['MI Numb'])
    x = 0
    itemsdict = {}
    try:
      mdf = pd.read_excel('Items.xlsx', sheet_name = dict[roll]['Item'], index_col = 0, usecols = 'E:F')
      mitems = dictcreator(mdf)
      while x < times:
        rolls = random.randint(1,100)
        itemsdict["var_" + str(x)] = tk.Label(root, text = mitems[rolls]['Item']).grid(row = row + 1, column = 1, columnspan = 2)
        x += 1
        row += 1
    except SyntaxError:
      pass
  art(dict, roll, row)
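# diceroller and dictcreator are helpers from the surrounding project and are not
# shown here. A hypothetical sketch of diceroller, assuming dice strings such as
# "1d4" or "2d6" in the 'MI Numb' columns:
import random

def diceroller(dice):
    # "2d6" -> roll a six-sided die twice and sum the results
    count, sides = (int(part) for part in dice.lower().split('d'))
    return sum(random.randint(1, sides) for _ in range(count))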
Example #3
def main():
    # Several datafiles, each with a long list of subjects

    # Directory path variable assignment, assumes script is in working directory!!!
    DATA = "data"
    MEASURE = "measure"
    EXCEL = "excel_files"

    # Mainly for testing purposes
    if len(sys.argv) > 1: 
        DATA = os.path.join(sys.argv[1], DATA)
        MEASURE = os.path.join(sys.argv[1], MEASURE)
        EXCEL = os.path.join(sys.argv[1], EXCEL)


    # Build a list of Subtest IDs parsed from the data file names.
    # Uses a list comprehension over the .txt files in DATA.
    SubTestIndex = [os.path.split(_file)[1].split('_')[0].split('Test')[1] for _file in glob(os.path.join(DATA,"*.txt"))]

    for sID in SubTestIndex:  # sID => subtest ID,  eg. Sub[03A]
        pXLXS = os.path.join(EXCEL, "Sub{0}_person_measure.xlsx".format(sID))
        pTXT = os.path.join(MEASURE, "Sub{0}_person_measure.txt".format(sID))

        if os.path.exists(pXLXS):
            person_measure = pd.read_excel(pXLXS, header=None, names=['Scores', 'NaN', 'SubID', '_SubID', '_NaN'])
            person_output = person_measure[['SubID', 'Scores']]
            person_output.to_csv(pTXT, sep='\t', index=False, header=False)

            iXLXS = os.path.join(EXCEL, "Sub{0}_item_measure.xlsx".format(sID))
            iTXT = os.path.join(MEASURE, "Sub{0}_item_measure.txt".format(sID))
            pd.read_excel(iXLXS, header=None).to_csv(iTXT, sep='\t', index=False, header=False)
Example #4
def grouping_parcels(group):

	# Parcel to census lookup
	df = pd.read_csv(r'R:\Brice\gis\parcels_urbansim_census.txt')

	# add low income geography tag
	low_inc = pd.read_excel(r'R:\Brice\gis\special-needs\ACS_15_5YR_Low-income.xlsx', sheetname='Map-income')
	minority = pd.read_excel(r'R:\Brice\gis\special-needs\ACS_15_5YR_Minority.xlsx', sheetname='Mapping')

	if group == 'low_income':
		# Threshold for % of households as total for determining low income or not
		income_threshold = 0.5

		# Define low_inc tracts as those with more HH below 200% median income than those above it
		low_inc['% low inc'] = low_inc['Below200']/low_inc['Total']

		# Create flag for whether low income or not
		low_inc.ix[low_inc['% low inc'] >= income_threshold,'low_inc_tract'] = 1
		low_inc.ix[low_inc['% low inc'] < income_threshold,'low_inc_tract'] = 0

		# Merge with parcel file
		newdf = pd.merge(df, low_inc[['GEOID10','low_inc_tract']], on='GEOID10', how='left')
		parcels_list = newdf[newdf['low_inc_tract'] == 1].parcelid.values

	elif group == 'minority':
		minority['% minority'] = minority['Minority']/minority['Total']
		minority_threshold = 0.5
		minority.ix[minority['% minority'] >= minority_threshold, 'minority_tract'] = 1
		minority.ix[minority['% minority'] < minority_threshold, 'minority_tract'] = 0

		# Merge with parcel file
		newdf = pd.merge(df, minority[['GEOID10','minority_tract']], on='GEOID10', how='left')
		parcels_list = newdf[newdf['minority_tract'] == 1].parcelid.values

	return parcels_list
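# DataFrame.ix was removed in pandas 1.0 and read_excel's 'sheetname' keyword is now
# 'sheet_name'. A rough sketch (not the original project's code) of the low-income
# branch under current pandas, assuming the same column names as above:
import pandas as pd

def low_income_parcels(df, low_inc, income_threshold=0.5):
    low_inc = low_inc.copy()
    # share of households below 200% of median income
    low_inc['% low inc'] = low_inc['Below200'] / low_inc['Total']
    # 1 where at least half of households are low income, else 0
    low_inc['low_inc_tract'] = (low_inc['% low inc'] >= income_threshold).astype(int)
    newdf = pd.merge(df, low_inc[['GEOID10', 'low_inc_tract']], on='GEOID10', how='left')
    return newdf[newdf['low_inc_tract'] == 1].parcelid.values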
def preprocess_greyc_nislab(in_file, out_file):
    """
    Preprocess the raw GREYC NISLAB dataset
    """
    df = pd.concat([pd.read_excel(in_file, sheetname=0),
                    pd.read_excel(in_file, sheetname=1),
                    pd.read_excel(in_file, sheetname=2),
                    pd.read_excel(in_file, sheetname=3),
                    pd.read_excel(in_file, sheetname=4)])

    df = df[df['Class'] == 2]

    df['age'] = (df['Age'] < 30).map({True: '<30', False: '>=30'})
    df['gender'] = df['Gender'].map({'F': 'female', 'M': 'male'})
    df['handedness'] = df['Handedness'].map({'L': 'left', 'R': 'right'})
    df['session'] = np.arange(len(df))

    df['password'] = df['Password'].map({
        'leonardo dicaprio': 1,
        'the rolling stones': 2,
        'michael schumacher': 3,
        'red hot chilli peppers': 4,
        'united states of america': 5,
    })

    def preprocess_row(idx_row):
        idx, row = idx_row
        keyname = list(map(lambda x: 'space' if x == ' ' else x, list(row['Password'])))
        v = np.array(row['Keystroke Template Vector'].strip().split()).astype(int) // 10000

        s = len(keyname) - 1
        pp, rr, pr, rp = [v[s * i:s * (i + 1)] for i in range(4)]

        timepress = np.r_[0, pp].cumsum()

        # Offset the first release time by the duration of the first key
        timerelease = np.r_[rp[0] - rr[0], rr].cumsum()

        # There are ~180 rows where timerelease == timepress.
        # Fix these by assuming at least the minimum standard clock resolution
        timerelease[timerelease == timepress] += 16
        sample = pd.DataFrame.from_items([
            ('user', row['User_ID']),
            ('session', row['session']),
            ('password', row['password']),
            ('age', row['age']),
            ('gender', row['gender']),
            ('handedness', row['handedness']),
            ('timepress', timepress),
            ('timerelease', timerelease),
            ('keyname', keyname)
        ])

        return sample

    df = pd.concat(map(preprocess_row, df.iterrows()))
    df = df.set_index(['user', 'session'])[COLS]
    df = remove_repeated_keys(df)
    df.to_csv(out_file)
    return
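# pandas.DataFrame.from_items (used above) was removed in pandas 1.0. On current
# pandas the same column-ordered frame can be built with the plain dict constructor,
# which keeps insertion order and broadcasts scalars; a small illustrative sketch:
import numpy as np
import pandas as pd

frame = pd.DataFrame({
    'user': 7,                              # scalar, repeated for every row
    'timepress': np.array([0, 95, 210]),    # one value per key press
    'keyname': ['p', 'a', 'space'],
})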
Example #6
	def labelspiezo(self) :
		"""
		Uses two sets of labels from EEG scorers to keep
		only those segments corresponding to agreed-upon
		scores 
		"""
		# First get file names
		# remove the '.mat' extension (str.strip would strip characters, not the suffix)
		lbls1name = self.filename.replace('.mat', '') + '.xls'
		lbls2name = self.filename.replace('.mat', '') + '_2.xls'

		# Import scores as dataframes
		lbls1 = pd.read_excel(self.filepath+lbls1name, header = None)
		lbls2 = pd.read_excel(self.filepath+lbls2name, header = None)

		# Concatenate into same dataframe and keep segments where equal
		concatted = pd.concat([lbls1[0],lbls2[0]],1)
		concatted.columns = ['scorer1','scorer2']
		scoredf = concatted[concatted['scorer1']==concatted['scorer2']]

		# scoredf is a dataframe with indices corresponding to the piezo
		# segments where there is agreement, and the identical labels in
		# each column

		# first reshape the piezo
		npr = np.reshape(self.piezo,[len(self.piezo)//(self.fs*4),self.fs*4])

		# this single function slices the reshaped piezo matrix such that
		# it retains only segments where doublescored
		self.piezomat = npr[scoredf.index]
		# as_matrix ensures indices are not saved since we need only labels
		self.labels = scoredf['scorer1'].as_matrix()
def vadir_get_cnames_replace(df_list, df_to_use):
    """
    This function determines the column differences between each
    of the excel files passed in.
    INPUT: list of excel files to import and the file with the
           right column names to use to compare against
    OUTPUT: dictionary of excel files as keys and list of unmatched
            columns as values of the dictionary
    """
    columns_to_use = []
    other_columns = {}
    unmatched_c = {}
    for df in df_list:
        if df == df_to_use:
            df_import = pd.read_excel(df)
            c_row = vadir_column_data_row(df_import)
            columns_to_use = vadir_clean_cnames(df_import, c_row)
            unmatched_c[df] = columns_to_use
        else:
            df_import = pd.read_excel(df)
            c_row = vadir_column_data_row(df_import)
            other_columns[df] = vadir_clean_cnames(df_import, c_row)
    for df, columns in other_columns.items():
        unmatched_c[df] = [c for c in columns if c not in columns_to_use]
    return unmatched_c
Example #8
    def download_iter(self, file, save_path="."):
        file_path = os.sep.join([save_path, file])

        if not os.path.exists(save_path):
            os.mkdir(save_path)

        with open(file_path, "wb") as cache:
            try:
                self.retrbinary("RETR %s" % file, cache.write)
            except:
                yield "", "", pandas.DataFrame(),False
                return

        if not zipfile.is_zipfile(file_path):
            ef = pandas.ExcelFile(file_path)
            yield file, ef.sheet_names[0], pandas.read_excel(ef),False
            return

        with zipfile.ZipFile(file_path, "r") as zip:
            xlss = []
            sheet_name = ""
            for name in zip.namelist():
                ef = pandas.ExcelFile(zip.open(name))
                sheet_name = ef.sheet_names[0]
                xls = pandas.read_excel(ef)
                yield name, sheet_name, xls,True
                xlss.append(xls)
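# A hypothetical usage sketch for the generator above, assuming the enclosing class
# mixes in ftplib.FTP (it calls self.retrbinary) and 'client' is a connected instance:
#
#     for name, sheet, frame, from_zip in client.download_iter("data.zip", save_path="cache"):
#         print(name, sheet, frame.shape, from_zip)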
Example #9
def qmflt(name="qfl.xlsx", Width=1, Color='k'):

    # Read the data from CSV or Excel, depending on the file extension.
    if "csv" in name:
        QmFLtRaw = pd.read_csv(name)
    elif "xlsx" in name:
        QmFLtRaw = pd.read_excel(name)
    qmfltline(Width, Color)
    Points = len(QmFLtRaw)
    for i in range(Points):
        q = QmFLtRaw.at[i, 'Qm']
        f = QmFLtRaw.at[i, 'F']
        l = QmFLtRaw.at[i, 'Lt']

        Q = 100 * q / (q + f + l)
        F = 100 * f / (q + f + l)
        L = 100 * l / (q + f + l)

        x = Q / 2 + (100 - Q) * L / (L + F)
        y = Q / 2 * math.sqrt(3)

        plotpoint(x, y, QmFLtRaw.at[i, 'Size'], QmFLtRaw.at[i, 'Color'], QmFLtRaw.at[i, 'Alpha'],
                  QmFLtRaw.at[i, 'Marker'])
    plt.savefig("QmFLt-Plot.png", dpi=600)
    plt.savefig("QmFLt-Plot.svg", dpi=600)
    plt.show()
Example #10
def main(args):
    #Load the answer key
    #Answer key must have the headings ['Problem', 'Your Answer', 'Answer Format']
    answer_df = pd.read_excel(args.answer_key, sheet_name=0)

    #Score
    scores = {}

    #Go through the individual sheets
    for student_answer in glob.glob(os.path.join( args.assign_dir, '*xlsx') ):
        print(student_answer)
        student_df = pd.read_excel(student_answer, sheet_name=0)
        #Check to make sure that the column headings are equal
        if (student_df.columns != answer_df.columns).any():
            print('ERROR with: %s' % student_answer)
        else:
            #Proceed with grading
            equal = (answer_df['Your Answer'].str.lower() == student_df['Your Answer'].str.lower())
            #Count all the false values
            eqval = equal.value_counts()
            #Pull the students name
            path, fname = os.path.split(student_answer)
            student_name = fname.split('_')[0]
            #True grade set
            scores[student_name] = eqval[eqval.index == True].values[0]

    #Sort and print a csv
    with open('exam_scores.csv', 'w') as wfile:
        print('Student,Score', file=wfile)

        for sname in sorted( list(scores.keys()) ):
            print( '%s,%d' % (sname, scores[sname]), file=wfile)
Example #11
def get_from_excel(data_path, extra_sheet=None):
    '''
    This opens a file dialog allowing you to select an excel file containing
    the tracked data, and returns a :class:`CellCluster` object.

    Parameters
    ----------

    data_path: the path to the excelTM file

    Returns
    -------

    cellcluster : a :class:`CellCluster` instance
         the container class for the tracking

    Notes
    -----

    The excel file should follow the structure of `excel_trajs_example.xlsx`
    in the project's `data` directory
    '''

    ### Read the data
    trajs = pd.read_excel(data_path, 0)
    trajs.t_stamp = trajs.t_stamp.astype(np.int)
    trajs.label = trajs.label.astype(np.int)
    trajs.set_index(['t_stamp', 'label'],
                    inplace=True)

    ### The Trajectories class is a subclass of
    ### pandas DataFrame
    ### Parsing excel files tends to add NaNs to the data
    trajs = Trajectories(trajs.dropna().sortlevel())
    metadata = pd.read_excel(data_path, 1)
    metadata = {name: value for name, value
                in zip(metadata['Name'], metadata['Value'])}

    metadata['FileName'] = data_path
    store_path = metadata['FileName']
    if '.' in store_path[-6:]:
        store_path = ''.join(store_path.split('.')[:-1]+['.h5'])
    else:
        store_path = store_path+'.h5'
    store_path = os.path.join(
        os.path.dirname(data_path), store_path)

    ### The ObjectsIO class
    objectsio = ObjectsIO(metadata=metadata, store_path=store_path)
    cellcluster = CellCluster(objectsio=objectsio)
    cellcluster.trajs = trajs
    cellcluster.oio['trajs'] = trajs
    if extra_sheet is not None:
        try:
            extra = pd.read_excel(data_path, extra_sheet)
            cellcluster.extra = extra
            cellcluster.oio['extra'] = extra
        except:
            print('Extra data from sheet {} not found in the file {}'.format(extra_sheet, data_path))
    return cellcluster
Example #12
def extract_bloomberg_excel(str_bbDataFile, str_bbIndexFile,is_excel):
    '''
    Convert the Excel file downloaded from Bloomberg into a dataframe and save it
    :param str_bbDataFile: the actual data file
    :param str_bbIndexFile: the metadata (index) file
    '''
    
    global df_bbData, df_bbDataCol
    
    if(is_excel):
        # the data
        df_bbData = pd.read_excel(str_bbDataFile,'Sheet1')
        df_bbData = df_bbData.ix[5:,:] # drop the title rows and rows without dates
        df_bbData = df_bbData.replace('#N/A N/A','') # remove the placeholder string Excel leaves in empty cells
        df_bbData = df_bbData.convert_objects(convert_numeric=True) # convert every column to numeric
        
        # the index list
        df_bbIndex = pd.read_excel(str_bbIndexFile, 'index')
        df_bbIndex.columns = ['no','idx','cat','rgn','rgn2','rmk','undf']
        df_bbDataCol = df_bbIndex[df_bbIndex['no'].isin(df_bbData.columns)][['no','idx','rgn2']]
        
        # save as CSV
        df_bbData.to_csv('../data/DailyEconomicData.csv',sep='\t',encoding='utf-8')
        df_bbDataCol.to_csv('../data/index.csv',sep='\t',encoding='utf-8')
    else:
        df_bbData = pd.read_csv('../data/DailyEconomicData.csv',sep='\t',encoding='utf-8')
        df_bbDataCol = pd.read_csv('../data/index.csv',sep='\t',encoding='utf-8')
Example #13
def xl_to_df(directory, file_dict):

	# Get excel file

	file_path = ''

	for file in file_dict:
		if not file['type'] == 'questions':
			file_path = str(directory) + '\\' + file['name']

		else:
			file_path2 = str(directory) + '\\' + file['name']

	main1 = pd.read_excel(file_path, pd.ExcelFile(file_path).sheet_names[0], encoding='utf-8')
	main2 = pd.read_excel(file_path2, pd.ExcelFile(file_path2).sheet_names[0], encoding='utf-8')

	# xls_file = pd.ExcelFile(file_path)
	# main1 = xls_file.parse( xls_file.sheet_names[0] )

	# xls_file2 = pd.ExcelFile(file_path2)
	# main2 = xls_file2.parse( xls_file2.sheet_names[0] )

	full_df = main1.append(main2)

	return full_df
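# DataFrame.append (used above) was removed in pandas 2.0; on current pandas the two
# sheets are stacked with pd.concat instead. A tiny self-contained sketch:
import pandas as pd

main1 = pd.DataFrame({'q': [1, 2]})
main2 = pd.DataFrame({'q': [3]})
full_df = pd.concat([main1, main2])  # equivalent of main1.append(main2)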
Example #14
File: views.py Project: Mihkorz/AMD
 def get_context_data(self, **kwargs):
     context = super(DocumentDetail, self).get_context_data(**kwargs)
     
     filename = settings.MEDIA_ROOT+"/"+self.object.document.name 
     if self.object.doc_type == 1:
         sniffer = csv.Sniffer()
         dialect = sniffer.sniff(open(filename, 'r').read(), delimiters='\t,;') # defining the separator of the csv file
         df = read_csv(filename, delimiter=dialect.delimiter)
         context['input'] = df[:50].to_html()
     else:
         # Render each expected sheet to HTML if it is present in the workbook.
         for sheet in ("PMS", "PMS1", "DS1", "DS2"):
             try:
                 df = read_excel(filename, sheetname=sheet)
                 context[sheet] = df.to_html()
             except:
                 pass
     
     
     return context  
Example #15
def load_references(xls_filename, errors, validation_errors):
    # Output columns can be different. Update according to the rename_columns dict:
    try:
        dfs = pandas.read_excel(xls_filename,
                                [#'core-24-depts',
                                 '(reference) senior-staff-grades',
                                 '(reference) professions',
                                 '(reference) units',
                                 ])
    except XLRDError, e:
        if str(e) == "No sheet named <'(reference) units'>":
            validation_errors.append(str(e))
            return {}
        elif str(e) in ("No sheet named <'(reference) senior-staff-grades'>",
                      "No sheet named <'(reference) professions'>"):
            # this doesn't matter - we will use the standard_references
            # anyway. Read it again, just for the units.
            try:
                dfs = pandas.read_excel(xls_filename, ['(reference) units'])
            except XLRDError, e:
                if str(e) == "No sheet named <'(reference) units'>":
                    validation_errors.append(str(e))
                else:
                    errors.append(str(e))
                return {}
Example #16
def excel(FilePath, FileName, SheetNameOrNone, *args, **kwargs):
	IndexColumn = kwargs.get('IndexColumn',None)
	from pandas import read_excel
        
	if FilePath.endswith('\\'):
		lastslash=''
	else:
		lastslash='\\'
            
	if FileName.endswith('.xlsx'):
		fext=''
	else:
		fext='.xlsx'
	
	# Try the .xlsx name first; fall back to .xls if reading fails.
	try:
		fullpath=FilePath+lastslash+FileName+fext
		EmptyVar=read_excel(fullpath, SheetNameOrNone, index_col=IndexColumn, na_values=['NA'])
	except:
		fext='.xls'
		fullpath=FilePath+lastslash+FileName+fext
		EmptyVar=read_excel(fullpath, SheetNameOrNone, index_col=IndexColumn, na_values=['NA'])

	return EmptyVar
Example #17
def get_hs300s():
    """
    Get the current constituents of the CSI 300 index and their weights
    Return
    --------
    DataFrame
        code  : stock code
        name  : stock name
        date  : date
        weight: weight
    """
    try:
        df = pd.read_excel(
            ct.HS300_CLASSIFY_URL % (ct.P_TYPE["http"], ct.DOMAINS["idx"], ct.INDEX_C_COMM, ct.PAGES["hs300b"]),
            parse_cols=[0, 1],
        )
        df.columns = ct.FOR_CLASSIFY_B_COLS
        df["code"] = df["code"].map(lambda x: str(x).zfill(6))
        wt = pd.read_excel(
            ct.HS300_CLASSIFY_URL % (ct.P_TYPE["http"], ct.DOMAINS["idx"], ct.INDEX_C_COMM, ct.PAGES["hs300w"]),
            parse_cols=[0, 3, 6],
        )
        wt.columns = ct.FOR_CLASSIFY_W_COLS
        wt["code"] = wt["code"].map(lambda x: str(x).zfill(6))
        return pd.merge(df, wt)
    except Exception as er:
        print(str(er))
def get_transfert_data_frames(year=None):
    assert year is not None
    default_config_files_directory = os.path.join(
        pkg_resources.get_distribution("openfisca_france_indirect_taxation").location
    )
    matrice_passage_file_path = os.path.join(
        default_config_files_directory,
        "openfisca_france_indirect_taxation",
        "assets",
        "Matrice passage {}-COICOP.xls".format(year),
    )
    parametres_fiscalite_file_path = os.path.join(
        default_config_files_directory,
        "openfisca_france_indirect_taxation",
        "assets",
        "Parametres fiscalite indirecte.xls",
    )
    matrice_passage_data_frame = pandas.read_excel(matrice_passage_file_path)
    if year == 2011:
        matrice_passage_data_frame["poste2011"] = matrice_passage_data_frame["poste2011"].apply(
            lambda x: int(x.replace("c", "").lstrip("0"))
        )
    parametres_fiscalite_data_frame = pandas.read_excel(parametres_fiscalite_file_path, sheetname="categoriefiscale")
    selected_parametres_fiscalite_data_frame = parametres_fiscalite_data_frame[
        parametres_fiscalite_data_frame.annee == year
    ]
    return matrice_passage_data_frame, selected_parametres_fiscalite_data_frame
Example #19
    def __init__(self, file, year=None, level="Départements"):
        """
        loads the data downloaded from `data.gouv.fr <http://www.data.gouv.fr/content/search?SortBy=Pertinence&SortOrder=0&SearchText=%C3%A9lections+2012>`_.

        @param      file        xls file
        @param      year        year (optional)
        @param      level       ``Départements`` or ``Cantons``
        """
        self.year = year
        self.level = level.lower().replace("s", "")
        if isinstance(file, list):
            self.tours = file
        else:
            self.tours = [pandas.read_excel(file, sheetname="%s T1" % level),
                          pandas.read_excel(file, sheetname="%s T2" % level)]
            for i, t in enumerate(self.tours):
                if len(t) == 0:
                    raise Exception("no data for tour %d" % (i + 1))
            self.tours = [self.process_tour(_) for _ in self.tours]
            for i, t in enumerate(self.tours):
                if len(t) == 0:
                    raise Exception("no data for tour %d" % i)
            try:
                self.tours = [
                    _.sort_values("Libellé du %s" % self.level, inplace=False) for _ in self.tours]
            except Exception as e:
                message = "unable to sort, shape={1} columns={0}".format(
                    ",".join(self.tours[0].columns), self.tours[0].shape)
                raise Exception(message) from e
Example #20
def get_barres_seq_data(force=False):
    global BARRES_SPECIES_DATA

    if force or not os.path.exists(BARRES_SEQ_PATH):
        LOGGER.info("Downloading Barres RNA Seq Data")
        response = requests.get(BARRES_SEQ_URL, stream=True)
        response.raise_for_status()

        with open(BARRES_SEQ_PATH, mode="wb") as f:
            for block in response.iter_content(1024):
                f.write(block)

    LOGGER.info("Reading Barres RNA Seq Data")
    BARRES_SPECIES_DATA = {
        "Homo sapiens": pd.read_excel(
            BARRES_SEQ_PATH,
            sheet_name="Human data only",
            skiprows=[0],
        ).iloc[1:],
        "Mus musculus": pd.read_excel(
            BARRES_SEQ_PATH,
            sheet_name="Mouse data only",
            skiprows=[0],
        ),
    }
Example #21
def load_All_BAVs(BAVfile,sheet_names):
    x = pandas.read_excel(BAVfile, sheet_names[0], index_col=0, na_values=['NA']).index
    data= dict();
    for sheet in sheet_names:
        df= pandas.read_excel(BAVfile, sheet, index_col=0, na_values=['NA'])
        x = intersect(x, df.index)
        
    for sheet in sheet_names:
        df =pandas.read_excel(BAVfile, sheet, index_col=0, na_values=['NA'])        
        good_cols = [col for col in df.columns if len(col.split("_"))==2]
        df= df[good_cols]
        df.columns = map(lambda x: x.split("_")[0],df.columns)
        
        try:        
            del df[u"Tough"]
        except:
            print "oh well"
        try:        
            del df[u"Visionary"]
        except:
            print "oh well"
        df=df[pruned_words]
        df = df.ix[x]        
        data[sheet]=df

    return (x,data)
Example #22
    def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names,
                                       c_idx_levels, r_idx_levels):
        # see gh-4679
        with ensure_clean(ext) as pth:
            if c_idx_levels == 1 and c_idx_names:
                pytest.skip("Column index name cannot be "
                            "serialized unless it's a MultiIndex")

            # Empty name case current read in as
            # unnamed levels, not Nones.
            check_names = r_idx_names or r_idx_levels <= 1

            df = mkdf(5, 5, c_idx_names, r_idx_names,
                      c_idx_levels, r_idx_levels)
            df.to_excel(pth)

            act = pd.read_excel(pth, index_col=list(range(r_idx_levels)),
                                header=list(range(c_idx_levels)))
            tm.assert_frame_equal(df, act, check_names=check_names)

            df.iloc[0, :] = np.nan
            df.to_excel(pth)

            act = pd.read_excel(pth, index_col=list(range(r_idx_levels)),
                                header=list(range(c_idx_levels)))
            tm.assert_frame_equal(df, act, check_names=check_names)

            df.iloc[-1, :] = np.nan
            df.to_excel(pth)
            act = pd.read_excel(pth, index_col=list(range(r_idx_levels)),
                                header=list(range(c_idx_levels)))
            tm.assert_frame_equal(df, act, check_names=check_names)
Example #23
    def test_excel_passes_na(self, read_ext):

        excel = ExcelFile('test4' + read_ext)

        parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False,
                               na_values=['apple'])
        expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']],
                             columns=['Test'])
        tm.assert_frame_equal(parsed, expected)

        parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True,
                               na_values=['apple'])
        expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
                             columns=['Test'])
        tm.assert_frame_equal(parsed, expected)

        # 13967
        excel = ExcelFile('test5' + read_ext)

        parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False,
                               na_values=['apple'])
        expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']],
                             columns=['Test'])
        tm.assert_frame_equal(parsed, expected)

        parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True,
                               na_values=['apple'])
        expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
                             columns=['Test'])
        tm.assert_frame_equal(parsed, expected)
Example #24
 def test_read_excel_nrows(self, read_ext):
     # GH 16645
     num_rows_to_pull = 5
     actual = pd.read_excel('test1' + read_ext, nrows=num_rows_to_pull)
     expected = pd.read_excel('test1' + read_ext)
     expected = expected[:num_rows_to_pull]
     tm.assert_frame_equal(actual, expected)
Example #25
 def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext):
     # GH 16645
     expected = pd.read_excel('test1' + read_ext)
     num_records_in_file = len(expected)
     num_rows_to_pull = num_records_in_file + 10
     actual = pd.read_excel('test1' + read_ext, nrows=num_rows_to_pull)
     tm.assert_frame_equal(actual, expected)
Example #26
    def test_reader_dtype(self, read_ext):
        # GH 8212
        basename = 'testdtype'
        actual = pd.read_excel(basename + read_ext)

        expected = DataFrame({
            'a': [1, 2, 3, 4],
            'b': [2.5, 3.5, 4.5, 5.5],
            'c': [1, 2, 3, 4],
            'd': [1.0, 2.0, np.nan, 4.0]}).reindex(
                columns=['a', 'b', 'c', 'd'])

        tm.assert_frame_equal(actual, expected)

        actual = pd.read_excel(basename + read_ext,
                               dtype={'a': 'float64',
                                      'b': 'float32',
                                      'c': str})

        expected['a'] = expected['a'].astype('float64')
        expected['b'] = expected['b'].astype('float32')
        expected['c'] = ['001', '002', '003', '004']
        tm.assert_frame_equal(actual, expected)

        with pytest.raises(ValueError):
            pd.read_excel(basename + read_ext, dtype={'d': 'int64'})
Example #27
    def test_excel_read_buffer(self, read_ext):

        pth = 'test1' + read_ext
        expected = pd.read_excel(pth, 'Sheet1', index_col=0)
        with open(pth, 'rb') as f:
            actual = pd.read_excel(f, 'Sheet1', index_col=0)
            tm.assert_frame_equal(expected, actual)
Example #28
	def __init__(self):
		## PDZ Domains
		temp_df = pd.read_excel(DATA+'\\theta_data.xlsx')
		self.aminoacids = [acid.encode('utf-8') for acid in list(temp_df.columns[:20])]
		self.df = temp_df.T
		self.domains = [Domain(domain.encode('utf-8')) for domain in list(self.df.columns)]
		self.domain_names = [domain.name for domain in self.domains]
		### Peptide sequences
		self.pep_seqs = []
		self.pep_names = []
		self.acid_names = ['Glycine', 'Alanine', 'Valine', 'Leucine', 'Isoleucine', 'Methionine', 'Proline', 'Phenylalanine', 'Tryptophan', 'Serine', \
		              'Threonine', 'Asparagine', 'Glutamine', 'Tyrosine', 'Cysteine', 'Lysine', 'Arginine', 'Histidine', 'Aspartate', 'Glutamate']
		self.acid_dict = {self.aminoacids[i]:self.acid_names[i] for i in range(len(self.aminoacids))}
		with open(DATA+'\\peptides.free') as f:
			for line in f:
				x = line.split()
				self.pep_seqs.append(x[1])
				self.pep_names.append(x[0])
		self.peptides = [Peptide(name) for name in self.pep_names]

		## Interaction: Which peptides bind to which domains
		self.fp_interaction_matrix = pd.read_excel(DATA+"\\fp_interaction_matrix.xlsx")
		for column in self.fp_interaction_matrix.columns:
			self.fp_interaction_matrix.loc[self.fp_interaction_matrix[column] == 0.0, column] = -1.0
		self.fp_interaction_matrix = self.fp_interaction_matrix.rename(columns = lambda x: str(x).replace(" ", ""))

		## Classification matrix
		self.class_matrix = np.zeros((2,2))
		self.class_matrix[0,0] = 0.85
		self.class_matrix[0,1] = 0.04
		self.class_matrix[1,0] = 0.15
		self.class_matrix[1,1] = 0.96
Example #29
 def __init__(self, db_filename = "fbo_solicitations.xlsx",
              report_prefix = "report", 
              sol_sheet_name = "solicitations",
              filtered_sheet_name = "filtered_solicitations",
              index_column = "sponsor_number",
              report_only_new = True):
     '''
     Constructor
     '''
     if(not os.path.isfile(db_filename)):
         #generate a blank writable excel sheet from scratch
         field_names = [field_name for field_name in Opportunity.fields]
         field_names.remove("filtered")
         writer = ExcelWriter(db_filename)
         sol_df = pd.DataFrame(columns = field_names)
         filtered_df = pd.DataFrame(columns = field_names)
         sol_df.to_excel(writer,sol_sheet_name)
         filtered_df.to_excel(writer,filtered_sheet_name)
         writer.save()
         writer.close()
     
     self.report_filename = (report_prefix + "_" 
                             + str(datetime.today())[:19]
                             .replace(":","_").replace(" ","[") + "].xlsx")
     #kept for posterity, in case only the date component is needed and we don't care about overwrites
     #self.report_filename = report_prefix + "_" + str(date.today())
     self.db_filename = db_filename
     self.sol_sheet_name = sol_sheet_name
     self.filtered_sheet_name = filtered_sheet_name
     self.sol_df = pd.read_excel(db_filename,sol_sheet_name, index_col = index_column)
     self.filtered_df = pd.read_excel(db_filename,filtered_sheet_name, index_col = index_column)
     self.usaved_sol_counter = 0
     self.sol_counter = 0
     self.added_items = set()
Example #30
def main(left_file="",
         right_file="",
         out_file="",
         on=[],
         how="left"):
    """given two xlsx (excel) workbooks, each containing one worksheet,
join the two worksheets and output a new workbook.

Parameters:

* `-l, --left-file`: The workbook which contains the worksheet
to consider the "left" table
* `-r, --right-file`: The workbook which contains the worksheet
to consider the "right" table
* `-o, --out-file`: The file to output the "joined" tables to
* `-O, --on`: A (space-separated) list of column names to join on
* `-H, --how`: how to join the two tables, must be one of "left",
"right", "outer" or "inner"

For more information on joining tables please see the
[pandas dataframe merge documentation](http://pandas.pydata.org/pandas-docs/version/0.17.1/generated/pandas.DataFrame.merge.html)
    """
    left = pd.read_excel(left_file)
    right = pd.read_excel(right_file)
    new = pd.merge(left, right, on=on, how=how)
    print "SAVING {}".format(out_file)
    new.to_excel(out_file, index=False)
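# A hypothetical direct call, equivalent to running the CLI described in the
# docstring with -l/-r/-o/-O/-H (the file names and join column are placeholders):
#
#     main(left_file="left.xlsx", right_file="right.xlsx",
#          out_file="joined.xlsx", on=["id"], how="inner")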
Example #31
df["grade"].cat.categories = ["very good", "good", "very bad"]
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium",\
                                              "good", "very good"])
print df["grade"]
print df.sort_values(by="grade")
print df.groupby("grade").size()
''' Plotting '''
ts = pd.Series(np.random.randn(1000),\
               index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,\
                  columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
plt.figure()
df.plot()
plt.legend(loc='best')
''' Getting Data In/Out '''
# CSV
df.to_csv('foo.csv')
print pd.read_csv('foo.csv')

# HDF5
df.to_hdf('foo.h5', 'df')
print pd.read_hdf('foo.h5', 'df')

# Excel
df.to_excel('foo.xlsx', sheet_name='Sheet1')
print pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
''' Gotchas '''
os.chdir("/jukebox/wang/zahra/python/BrainPipe")
from tools.analysis.network_analysis import make_structure_objects

#set appropriate pth
src = "/jukebox/wang/zahra/kelly_cell_detection_analysis"
erode_pth = os.path.join(src, "annotation_allen_2017_25um_sagittal_erode_80um.tif")
dilate_pth = os.path.join(src, "dilated_atlases")

fig_dst = "/home/wanglab/Desktop"
df_pth = "/jukebox/LightSheetTransfer/atlas/allen_atlas/allen_id_table_w_voxel_counts_16bit.xlsx"
ann_pth = "/jukebox/LightSheetTransfer/atlas/allen_atlas/annotation_2017_25um_sagittal_forDVscans_16bit.tif"

#%%
#read vols
ann = sitk.GetArrayFromImage(sitk.ReadImage(ann_pth))
df = pd.read_excel(df_pth)
er_ann = tifffile.imread(erode_pth)
dl_anns = [os.path.join(dilate_pth, xx) for xx in os.listdir(dilate_pth)]

org_iids = np.unique(ann)[1:] #excluding 0
er_iids = np.unique(er_ann)[1:]

missing = [iid for iid in org_iids if iid not in er_iids]

missing_struct_names = [nm for nm in df.name.values if df.loc[df.name == nm, "id"].values[0] in missing] #excluding root
missing_struct_voxels = [df.loc[df.name == nm, "voxels_in_structure"].values[0] for nm in missing_struct_names]
#replace id column that matches to names
missing_struct_ids = [df.loc[df.name == nm, "id"].values[0] for nm in missing_struct_names]

#get parent names
missing_struct_parents = [df.loc[df["id"] == iid, "parent_name"].values[0]
# Import Built-Ins
import logging
# Import Third-Party
# Import Homebrew
import matplotlib.pyplot as plt

plt.style.use('bmh')
# Init Logging Facilities
log = logging.getLogger(__name__)
#################################################################
# 1- Load data
indicators_value = []
ticker_name = []
glob.glob("D:\Stock Study Excel Files\Input Excel Files\Stock USA\*.xlsx")
for f in glob.glob('D:\Stock Study Excel Files\Input Excel Files\Stock USA\*.xlsx'):
    df = pd.read_excel(f)
   # df.columns = map(str.capitalize, df.columns)
    #df.rename(columns={'Volume': 'Volume_BTC'}, inplace=True)
    tike = f.split('\\')[-1].split('.')[0]
    print(tike)
    df.insert(1, 'TICKER', tike)  # to bring excel file name
    # Clean nan values
    df = ta.utils.dropna(df)
    ####################################################################
    # 2-Add all ta features filling nans values (from Ta-Lib Except SuperTrend Because not in Ta-Lib)
    df = ta.add_all_ta_features(df, "Open", "High", "Low", "Close", "Volume_BTC", fillna=True)

    #####################################################################
    # 3- Calculate
    df['Signal'] = 0
    sell = []
import matplotlib.pyplot as plt
import pandas as pd
url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'
df = pd.read_csv(url,sep=";")
print(df.head())
pd.DataFrame.hist(df.ix[:, 0:1])
plt.xlabel('fixed acidity (g(tartaric acid)/dm$^3$)')
plt.ylabel('count')
plt.show()

#Importing non-flat files from the web

import pandas as pd
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/importing_data_into_r/latitude.xls'
xls = pd.read_excel(url,sheet_name=None)
print(xls.keys())
print(xls['1700'].head())

#Performing HTTP requests in Python using urllib

from urllib.request import urlopen,Request
url = "http://www.datacamp.com/teach/documentation"
request = Request(url)
response = urlopen(request)
print(type(response))
response.close()

#Printing HTTP request results in Python using urllib

from urllib.request import urlopen, Request
Example #35
from pprint import pprint
import os
import json
import csv

import pandas as pd  # needed for pd.read_excel below



from flask import (
    Flask,
    render_template,
    jsonify,
    request,
    redirect,
    url_for,)

dataset1 = pd.read_excel("BronxPropertySalesDatasets/sales_bronx_03.xls")


app = Flask (__name__)

SITE_ROOT = os.path.realpath(os.path.dirname(__file__))
json_url = os.path.join(SITE_ROOT, "data", "data.json")
data = json.load(open(json_url))


AIRBNB_SITE_ROOT = os.path.realpath(os.path.dirname(__file__))
airbnb_json_url = os.path.join(AIRBNB_SITE_ROOT, "data", "airbnb_data.json")
data_airbnb = json.load(open(airbnb_json_url))


PROPERTYSALES_SITE_ROOT = os.path.realpath(os.path.dirname(__file__))
Example #36
def main():
    import cdsapi
    import numpy as np
    import os
    import pandas as pd
    import math

    def quarter_up(x):
        return math.ceil(x * 4) / 4

    def quarter_down(x):
        return math.floor(x * 4) / 4

    c = cdsapi.Client()

    file = '/Volumes/Neely/BioDAR/ERA5/sites of light and suction traps.xlsx'
    suction_traps = pd.read_excel(file, header=0, sheet_name='Suction traps')
    number_of_traps = len(suction_traps['Lat'])
    areas = []
    trap_name = []

    for a in range(0, number_of_traps):
        lats = [
            quarter_up(suction_traps['Lat'][a]),
            quarter_down(suction_traps['Lat'][a])
        ]
        longs = [
            quarter_up(suction_traps['Long'][a]),
            quarter_down(suction_traps['Long'][a])
        ]
        areas.append([
            max(lats),
            min(longs),
            min(lats),
            max(longs),
        ])
        trap_name.append(suction_traps['Trap name'][a].replace(" ", "_"))

    start_year = 1979
    stop_year = 2020

    years = np.arange(start_year, stop_year + 1)
    months = [
        '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'
    ]
    days = [
        '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12',
        '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24',
        '25', '26', '27', '28', '29', '30', '31'
    ]

    for year in years:
        for month in months:
            for day in days:
                for idx, area in enumerate(areas):
                    try:
                        outdir = '/Volumes/Neely/BioDAR/ERA5/Myrna_TrapLocations_0_25_Box/suction_traps/pres_levels/' \
                                 + str(trap_name[idx]) + '/'
                        if not os.path.exists(outdir):
                            os.makedirs(outdir)
                        file_name = outdir + 'era5_pres_level_' + str(trap_name[idx]) + '_' + \
                                    str(year) + str(month) + str(day) + '.nc'
                        print(str(trap_name[idx]), area)

                        print(file_name)

                        if os.path.isfile(file_name) == True:
                            print('exists')
                            continue

                        else:
                            c.retrieve(
                                'reanalysis-era5-pressure-levels', {
                                    'product_type':
                                    'reanalysis',
                                    'format':
                                    'netcdf',
                                    'variable': [
                                        'divergence',
                                        'fraction_of_cloud_cover',
                                        'geopotential',
                                        'ozone_mass_mixing_ratio',
                                        'potential_vorticity',
                                        'relative_humidity',
                                        'specific_cloud_ice_water_content',
                                        'specific_cloud_liquid_water_content',
                                        'specific_humidity',
                                        'specific_rain_water_content',
                                        'specific_snow_water_content',
                                        'temperature',
                                        'u_component_of_wind',
                                        'v_component_of_wind',
                                        'vertical_velocity',
                                        'vorticity',
                                    ],
                                    'pressure_level': [
                                        '1',
                                        '2',
                                        '3',
                                        '5',
                                        '7',
                                        '10',
                                        '20',
                                        '30',
                                        '50',
                                        '70',
                                        '100',
                                        '125',
                                        '150',
                                        '175',
                                        '200',
                                        '225',
                                        '250',
                                        '300',
                                        '350',
                                        '400',
                                        '450',
                                        '500',
                                        '550',
                                        '600',
                                        '650',
                                        '700',
                                        '750',
                                        '775',
                                        '800',
                                        '825',
                                        '850',
                                        '875',
                                        '900',
                                        '925',
                                        '950',
                                        '975',
                                        '1000',
                                    ],
                                    'year': [str(year)],
                                    'month': [month],
                                    'day': [day],
                                    'time': [
                                        '00:00',
                                        '01:00',
                                        '02:00',
                                        '03:00',
                                        '04:00',
                                        '05:00',
                                        '06:00',
                                        '07:00',
                                        '08:00',
                                        '09:00',
                                        '10:00',
                                        '11:00',
                                        '12:00',
                                        '13:00',
                                        '14:00',
                                        '15:00',
                                        '16:00',
                                        '17:00',
                                        '18:00',
                                        '19:00',
                                        '20:00',
                                        '21:00',
                                        '22:00',
                                        '23:00',
                                    ],
                                    'area':
                                    area,
                                }, file_name)
                    except:
                        continue
Example #37
def convert_xlsx(filename, sheetname, csv_name):
    data_xls = pandas.read_excel(filename, sheetname, convert_float=False, index_col=None)
    data_xls.to_csv(csv_name, encoding='utf-8')
Example #38
def xl2csv(path):
    df = pandas.read_excel(path)
    csvfileloc = '/home/py01/Desktop/ratings5.csv'
    df.to_csv(csvfileloc, sep='\t', encoding='utf-8', index=False)
    os.remove(path)
            items = [int(item) for item in items]
            df[col] = pd.Series(items, dtype=int)
        else:
            df[col] = pd.Series(items, dtype=float)

##############################################################################
################################### MAIN #####################################
##############################################################################

if __name__ == '__main__':
    # make output directory if not exists
    cwd = os.getcwd()
    if not cwd.endswith('/'): 
        cwd += '/'
    if not OUTDIR.endswith('/'): 
        OUTDIR += '/'
    try:
        os.mkdir(OUTDIR)
        print('Output folder created: %s' % (cwd + OUTDIR))
    except:
        pass

    # iterate through each sheet and format / spit out CSV for D3
    for i in range(len(SHEET_NAMES)):
        print('\nWorking on %s sheet' % SHEET_NAMES[i])
        df = pd.read_excel(INFILE, sheet_name=i+1, dtype=str, na_filter=False)
        remove_trailing_whitespace(df)
        typify_dataframe(df)
        df.to_csv(OUTDIR + SHEET_NAMES[i] + '.csv', index=None)
        print('Saved to %s' % (cwd + OUTDIR + SHEET_NAMES[i] + '.csv'))
import warnings
warnings.filterwarnings("ignore")




# Get dataset and features
#==============================#

aalist = list('ACDEFGHIKLMNPQRSTVWY')
def getAAC(seq):
    aac = np.array([seq.count(x) for x in aalist])/len(seq)
    return aac

data = pd.read_excel('sequence_ogt_topt.xlsx', index_col=0)
aac = np.array([getAAC(seq) for seq in data['sequence']])
ogt = data['ogt'].values.reshape((data.shape[0],1))
X = np.append(aac, ogt, axis=1)
sc = StandardScaler()
X = sc.fit_transform(X)
y = data['topt'].values



# Strategies and hyperparameters
#======================================#

# Hyperparameter range
cl_vals = [25.0, 30.0, None]
ch_vals = [72.2, 60.0]
import pandas as pd
from sklearn.neural_network import MLPClassifier
import numpy as np
import json
import scipy as sc

train_file=pd.read_json(r'D:\Data Analysis\Project2\Project2\Data-Analysis-Project2\train.json', orient='records')
test_file=pd.read_csv(r'D:\Data Analysis\Project2\Project2\Data-Analysis-Project2\sample_solution.csv', header=0)    
train_inp=pd.read_excel(r'D:\Data Analysis\Project2\Project2\Data-Analysis-Project2\Train_Input.xlsx',index_col=None,header=None)
test_inp=pd.read_excel(r'D:\Data Analysis\Project2\Project2\Data-Analysis-Project2\Test_Input.xlsx',index_col=None,header=None)
train_l=train_file['cuisine']
test_l=test_file['cuisine']
train_label=train_l[0:4000]
test_label=test_l[0:1000]




best=0
accuracy=0
hl=[1,3]
act=['logistic', 'tanh', 'relu']
sol=['lbfgs','sgd','adam']
al=[0.0001,0.0005]
bs=[64,128]
lr=['constant','invscaling','adaptive']
best_params = [0,0,0,0,0,0]
params = [0,0,0,0,0,0]
for h in hl:
    for a in act:
import pandas as pd
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_excel(input_file, sheet_name='january_2013')

writer = pd.ExcelWriter(output_file)
data_frame.to_excel(writer, sheet_name='jan_13_output', index=False)
writer.save()
Example #43
                  category_compare, padding)
from BusinessPulseSurvey import business_pulse, qa_for_loc, qa_by_loc, compare_questions_locations, stacked_by_loc, qa_diff_by_loc

dir_path = os.path.dirname(os.path.abspath(__file__))

if not os.path.exists(config.log_dir):
    os.makedirs(config.log_dir)
if not os.path.exists(config.log_dir + config.log_file):
    with open(config.log_dir + config.log_file, 'w+'):
        pass
logging.basicConfig(filename=config.log_dir + config.log_file,
                    level=logging.INFO)
logging.info('%s Economic Dashboard Started', datetime.datetime.now())

PUA_url = 'https://oui.doleta.gov/unemploy/docs/weekly_pandemic_claims.xlsx'
pua_data = pd.read_excel(PUA_url)
fileloc = config.fileloc

y2k = '2000-01-01'
cy = '2020-01-01'
rs = '2020-02-01'  #Recession start date


#%% Overall Trends
def overall_trends():
    logging.info('%s Overall Trends Started', datetime.datetime.now())
    series = ['RSAFS', 'IPMAN', 'PAYEMS', 'DGORDER']
    national_trends = fred_chart(series,
                                 '2019-01-01',
                                 transformation='index',
                                 transform_date=rs,
Example #44
def second(batch):
   
    index_col_2_yr="B.TECH. II Yr.(III SEMESTER TIMETABLE) ODD SEMESTER 2018(Combined) JIIT128(Effective from 17/07/2018)"

    data=pd.read_excel("timetable2.xlsx", index_col=index_col_2_yr)
    #sperating cols

    data.columns=[1,2,3,4,5,6,7,8,9]

    data.columns.name=" "

    #seprating days

    mon=data.loc["MON":"TUE"].iloc[:-1]
    tue=data.loc["TUE":"WED"].iloc[:-1]
    wed=data.loc["WED":"THURS"].iloc[:-1]
    thu=data.loc["THURS":"FRI"].iloc[:-1]
    fri=data.loc["FRI":"SAT"].iloc[:-1]
    sat=data.loc["SAT":].iloc[:-1]



    #list of df
    data2=[mon,tue,wed,thu,fri,sat]

    final=data.dropna()
    #data2
    #final=data.dropna()
    #final
    #edit data frame here . make first row the column labels

    rows=[]



    #realgame
    #move all to class and fns

    for i in range (0,6):
        newlist=[]
        for j in range(1,10):
            new=data2[i][j].dropna()
            new2=new.str.contains(batch)
            new3=new.str.contains('ALL')
            new2=new2|new3
            if not ((new[new2]).empty):
                temp=new[new2].tolist()[0].replace("\n","")
                
                #comment to show subject code
                # temp1=temp.find('(')
                # temp2=temp.find(')')
                # temp=temp[:temp1]+temp[temp2+1:]
                #comment to show subject code
                
                temp3=temp.find('/')
                temp=temp[:temp3]
                newlist.append(temp)
                #method 2
                #newlist.append(new[new2].tolist().replace('\n',''))
            else:
                newlist.append(" ")
        rows.append(newlist)

        
    #final.append(new[new2])
    #final

    final=pd.DataFrame(rows,index=["Mon","Tue","Wed","Thu","Fri","Sat"],columns=data.iloc[0])
    final.columns.name="Days/Time"

    return(final.transpose().to_dict('list'))
Example #45
# -*- coding: utf-8 -*-
# tf_idf1
#
# test script for setup of tf-idf
#

import os
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

#add a project name for output
project_name = 'Daily Mail Property Articles'

# read sf export
sf_export = pd.read_excel(r'C:\Users\JLee35\Automation\TF-IDF\input\body_copy.xlsx')

corpus =[]
for i in sf_export.index:
    doc = sf_export['Body Copy Full'][i]
    corpus.append(doc)

# counts the length of the list
doc_count = len(corpus)
print(f'Total number of documents = {doc_count}')

# use TfidfVectorizer from Scikit-Learn to transform the corpus
stop = stopwords.words('english')
vectorizer = TfidfVectorizer(max_df=.65, min_df=1, ngram_range=(1,1), stop_words=stop, use_idf=True, norm=None)
transformed_documents = vectorizer.fit_transform(corpus)
transformed_documents_as_array = transformed_documents.toarray()
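# A short follow-up sketch (not part of the original script): rank the highest-scoring
# terms for the first document. get_feature_names_out assumes scikit-learn >= 1.0;
# older releases use get_feature_names instead.
terms = vectorizer.get_feature_names_out()
first_doc = pd.DataFrame({'term': terms, 'score': transformed_documents_as_array[0]})
print(first_doc.sort_values('score', ascending=False).head(10))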
import numpy as np
import pandas as pd
import streamlit as st
import altair as alt


df=pd.read_excel('Base_sondage_maraichage.xlsx', index_col="Identifiant", na_values=['NA'])

df = df.fillna({"Mode_irrigation": "Pluvial"})

cleanup_nums = {

"Mode_Production":     {"Principale": 1, "En succession": 2, 
                                        "En association": 3, "Sous étage": 4},
    
"Mode_irrigation": {"Localisée": 1, "Gravitaire": 2, "Aspersion": 3,
                                    "Pivot": 4,
                                  "Gravitaire,Localisée": 5, "Localisée,Pivot": 6, "Pluvial":7},
    
"Culture": {"Courgette": 1, "Pomme de terre": 2, "Tomate": 3,
                                    "Coriandre et persil": 4,
                                  "Haricot vert": 5, "Concombre": 6,
           "Menthe": 7, "Fève vert": 8, "Aubergine": 9,
                                    "Carotte": 10,
                                  "Chou fleur": 11, "Oignon":12, "Choux vert":13, "Celeri": 14,
            "Laitue": 15, "Tomate kiwat": 16, "Fraise": 17,
                                    "Piment fort": 18,
                                  "Artichaut": 19, "Absinthe": 20,
            "Haricot Helda": 21, "Topinambour": 22, "Myrtille": 23,
                                    "Endive": 24,
                                  "Navet": 25, "Pastèque":26, "Poivron": 27},
Example #47
"""
Reference:
http://pbpython.com/market-basket-analysis.html

"""
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

df = pd.read_excel('http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx')
print(df.head())

df['Description'] = df['Description'].str.strip()
df.dropna(axis=0, subset=['InvoiceNo'], inplace=True)
df['InvoiceNo'] = df['InvoiceNo'].astype('str')
df = df[~df['InvoiceNo'].str.contains('C')]


basket = (df[df['Country'] =="France"]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('InvoiceNo'))

basket_sets = basket.applymap(encode_units)
basket_sets.drop('POSTAGE', inplace=True, axis=1)
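# The snippet imports apriori and association_rules but is cut off before using them.
# A minimal sketch of the usual next step from the referenced tutorial, assuming the
# basket_sets frame built above (the support and lift thresholds are illustrative):
frequent_itemsets = apriori(basket_sets, min_support=0.07, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print(rules.head())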
Example #48
# Import libraries
import plotly.offline as pyo
import plotly.graph_objs as go
import pandas as pd

# Load the data
df_temp = pd.read_excel(
    r'C:\Users\ivan_pinar\Dropbox\Creación de MOCs\MOC Dash Python\Datasets\3.8\Temperaturas.xlsx'
)

#Definición de objeto de tipo lista "data", x --> Categorízación, y --> Valores a verificar distribución
data = [
    go.Box(x=df_temp["Ciudad"], y=df_temp["T_Promedio"])
]  #pointpos=0 para ubicación de los puntos en el centro / boxpoints ="all" si se quieren visualizar todos los puntos

#Definición de objeto "layout": diseño del gráfico como título, nombres de ejes,...
layout = go.Layout(title="Box & whiskers Temperatura")

#Creación de objeto "Figure" de Plotly a partir de los objetos data y layout creados previamente
fig = go.Figure(data=data, layout=layout)
# Generate the plot from the figure and set the name of the output HTML file
pyo.plot(fig, filename="3.8 Temp_Box Plot.html")
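# Variant hinted at in the comment above, shown as an illustrative sketch with the
# same columns (the output filename is invented): draw every observation next to
# its box, centred underneath it.
data_all_points = [
    go.Box(x=df_temp["Ciudad"], y=df_temp["T_Promedio"], boxpoints="all", pointpos=0)
]
fig_all_points = go.Figure(data=data_all_points, layout=layout)
pyo.plot(fig_all_points, filename="3.8 Temp_Box Plot_all_points.html")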
print('**************************************************    Analyzing {:s}'.
      format(state.upper()))
print()

#excel = r'K:\DEEP_SOLAR_BIG2\TN\TNDeep_Solar.xlsx'
if region == TVA_l:
    #excel = r'K:\TVA_SVI\TVA_DS_SVI_merged.xlsx'
    excel = r'K:\TVA_SVI\TVA_DS_SVI_merged2.xlsx'
elif region == TVA_f:
    #excel = r'K:\TVA_SVI\TVA_DS_SVI_merged.xlsx'
    excel = TVA_path
else:
    excel = r'K:\TVA_SVI\TN_DS_SVI_merged.xlsx'
    #TNDS = pd.read_excel(excel, index='fips').fillna(0)
    #TNDS = pd.read_excel(excel, index='fips').dropna(axis=0)
TNDS = pd.read_excel(excel, index_col='fips')
#TNDS = TNDS.fillna(TNDS.mean(axis=1))

if region != TVA_l and region != TVA_f:
    print('Getting region {:s}'.format(region))
    TNDS = TNDS.loc[TNDS['ST_ABBR'] == region.upper()]

print('splitting data')
adopters, high, mod, non = split_data_res_adopt_non(TNDS,
                                                    hthr=10,
                                                    midrange=[1, 10],
                                                    lthr=1,
                                                    verbose=False)
dd.display_percentages(TNDS.shape[0],
                       adopters.shape[0],
Example #50
def disp1():
    import io
    from pdfminer.converter import TextConverter
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    text = ''

    def extract_text_from_pdf(pdf_path):
        with open(pdf_path, 'rb') as fh:
            # iterate over all pages of PDF document
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                # creating a resource manager
                resource_manager = PDFResourceManager()

                # create a file handle
                fake_file_handle = io.StringIO()

                # creating a text converter object
                converter = TextConverter(resource_manager,
                                          fake_file_handle,
                                          codec='utf-8',
                                          laparams=LAParams())

                # creating a page interpreter
                page_interpreter = PDFPageInterpreter(resource_manager,
                                                      converter)

                # process current page
                page_interpreter.process_page(page)

                # extract text
                text = fake_file_handle.getvalue()
                yield text

                # close open handles
                converter.close()
                fake_file_handle.close()

    # calling above function and extracting text
    #print(fname)
    file_path = "D:/resume_analysis/static/resumes/" + fname
    fp = file_path.split('/')
    f = fp[len(fp) - 1]
    for page in extract_text_from_pdf(file_path):
        text += ' ' + page
    #print(text)
    import spacy
    from spacy.matcher import Matcher

    # load pre-trained model
    nlp = spacy.load('en_core_web_sm')

    # initialize matcher with a vocab
    matcher = Matcher(nlp.vocab)

    def extract_name(resume_text):
        nlp_text = nlp(resume_text)

        # First name and Last name are always Proper Nouns
        pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]

        matcher.add('NAME', None, pattern)

        matches = matcher(nlp_text)

        for match_id, start, end in matches:
            span = nlp_text[start:end]
            return span.text

    name = extract_name(text)
    #print(name)
    import re

    def extract_mobile_number(text):
        phone = re.findall(
            re.compile(
                r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'
            ), text)

        if phone:
            number = ''.join(phone[0])
            if len(number) > 10:
                return '+' + number
            else:
                return number

    num = extract_mobile_number(text)
    #print(num)
    import re

    def extract_email(email):
        email = re.findall("([^@|\s]+@[^@]+\.[^@|\s]+)", email)
        if email:
            try:
                return email[0].split()[0].strip(';')
            except IndexError:
                return None

    email = extract_email(text)
    #print(email)
    import pandas as pd
    import spacy
    #from spacy.en import English
    # load pre-trained model
    nlp = spacy.load('en_core_web_sm')
    #noun_chunk = nlp.noun_chunks
    #nlp=English()
    doc = nlp(text)

    def extract_skills(resume_text):
        nlp_text = nlp(resume_text)

        # removing stop words and implementing word tokenization
        tokens = [token.text for token in nlp_text if not token.is_stop]
        #print(tokens)
        # reading the csv file
        data = pd.read_csv("D:/resume_analysis/techskill.csv")

        # extract values
        skills = list(data.columns.values)

        skillset = []

        # check for one-grams (example: python)
        for token in tokens:
            if token.lower() in skills:
                skillset.append(token)

        # check for bi-grams and tri-grams (example: machine learning)
        for token in doc.noun_chunks:
            token = token.text.lower().strip()
            if token in skills:
                skillset.append(token)

        return [i.capitalize() for i in set([i.lower() for i in skillset])]

    text = text.lower()
    skill = extract_skills(text)
    print(skill)

    skill_len = len(skill)
    excel_file = 'D:/resume_analysis/jd.xls'
    jd = pd.read_excel(excel_file)
    skill1 = jd['Skills']
    row = jd.shape[0]
    res = []

    for i in range(row):
        count = 0
        sk = skill1[i].split(',')
        for j in skill:
            if (skill1[i].find(j) != -1):
                count = count + 1

        res.append(100 * count / len(skill))
    ind = res.index(max(res))
    print(jd['JobTitle'][ind])
    res1 = []
    for i in res:
        res1.append(i)
    res1 = sorted(res1)
    p1 = max(res)
    print(res1)
    # res1 is sorted ascending; the best score is already in p1, so take the
    # second- and third-best scores from the end of the sorted list.
    second = res1[len(res1) - 2]
    third = res1[len(res1) - 3]
    print(max(res), second, third)
    res[ind] = -res[ind]
    ind1 = res.index(second)
    res[ind1] = -res[ind1]
    ind2 = res.index(third)
    print(ind1)
    s1 = jd['Skills'][ind]
    s2 = jd['Skills'][ind1]
    s3 = jd['Skills'][ind2]
    rs1, rs2, rs3 = [], [], []

    for j in skill:
        if (j in s1):
            rs1.append(j)
    for j in skill:
        if (j in s2):
            rs2.append(j)
    for j in skill:
        if (j in s3):
            rs3.append(j)
    return render_template('car.html',
                           job1=jd['JobTitle'][ind],
                           skills1=rs1,
                           job2=jd['JobTitle'][ind1],
                           skills2=rs2,
                           job3=jd['JobTitle'][ind2],
                           skills3=rs3,
                           p1=round(p1, 3),
                           fnam=f,
                           p2=round(second, 3),
                           p3=round(third, 3))
Example #51
def read_file(path,name):
  df = pd.read_excel('%s/%s'%(path,name))
  return df
driver.find_element_by_name('password').send_keys('t4')
time.sleep(.2)
driver.find_element_by_name('password').send_keys('78')
time.sleep(.2)
driver.find_element_by_name('password').send_keys('@g')
time.sleep(.2)
driver.find_element_by_name('password').send_keys('ma')
time.sleep(.2)
driver.find_element_by_name('password').send_keys('il')
time.sleep(.2)
driver.find_element_by_name('password').send_keys('.c')
time.sleep(.2)
driver.find_element_by_name('password').send_keys('om')
driver.find_element_by_name('password').send_keys(Keys.RETURN)
time.sleep(60)
abc = pd.read_excel('C:\\Users\\acer\\Downloads\\cds\\mat.xls', header=None, index_col=False)
f = open('data.csv', 'a')
var=0
continueCheck = False
for item in abc.index:
    print(abc[0][item])
    website = str(abc[0][item])
    if website == '6annonce.com':
        continueCheck = True
        continue

    if continueCheck:
        driver.get('https://pro.similarweb.com/#/website/worldwide-overview/'+website+'/*/999/3m?webSource=Total')
        # try:
        if driver.title != 'Pardon Our Interruption':
            wait = WebDriverWait(driver, 40)
def createRUID_List(rowIdxList, headerStr):
    """
    Loops over a series containing row indices and returns a list of RUID strings.
    Inputs:
      rowIdxList - collection of row index values 
      headerStr - DataFrame header string value for column containing RUIDs
    Outputs:
      new list containing RUID strings
    """
    RUID_List = []
    for aRowIdx in rowIdxList:
        workingRUID=df[headerStr].iloc[aRowIdx]
        RUID_List.append(workingRUID)
    return RUID_List

df = pd.read_excel("abcd_rucdr_master_forPython.xlsx")
print ('Finished reading in input file.')

#blackList=['NDAR_INV']
#for pattern in blackList:
#    df['pGUID_Rutgers'] = df['pGUID_Rutgers'].replace(pattern, '')
    
#datasets
Unique_DAIC_Invs = df['InvCodeDAIC_OnlyTxt'].dropna()
Unique_Rutgers_Invs = df['InvCodeRUCDR_OnlyTxt'].dropna()
AllRutgersInvs = df['InvCodeMinusDOTxt'].dropna()
AllDAIC_Invs = df['InvCodeMinusROTxt'].dropna()



print ('About to start first match2collections.') 
Example #54
def main():
    df = pd.read_excel(INPUT_FILE)
    wb = op.Workbook()
    ws = wb.active
    ws.append(['Месяц'] + ['Параметр'] + [i for i in range(1, 32)])

    wells = list(dict.fromkeys(df.well))

    # Iterate over well
    for well in wells:
        print(well)
        ws.append([well])
        last_row = ws.max_row
        ws.merge_cells(start_row=last_row,
                       start_column=1,
                       end_row=last_row,
                       end_column=33)

        well_df = df[df.well == well]
        years = list(dict.fromkeys(well_df.date.dt.year))
        # Iterate over year
        for year in years:
            year_df = well_df[well_df.date.dt.year == year]
            months = list(dict.fromkeys(year_df.date.dt.month))
            for month in months:
                month_df = year_df[year_df.date.dt.month == month]

                # Blank rows
                q = [None for i in range(31)]
                dynamic = [None for i in range(31)]
                static = [None for i in range(31)]
                # Iterate over month data
                for _, row in month_df.iterrows():
                    day = row.date.day
                    q[day - 1] = round(row.rate, 1)
                    dynamic[day - 1] = round(row.dynamic, 1)
                    static[day - 1] = round(row.static, 1)

                # Write rows to sheet
                ws.append([f'{months_names[month - 1]} {year}'] +
                          ['Q, м3/сут'] + q)
                ws.append([None] + ['Нд, м'] + dynamic)
                ws.append([None] + ['Нст, м'] + static)
                last_row = ws.max_row
                ws.merge_cells(start_row=last_row - 2,
                               start_column=1,
                               end_row=last_row,
                               end_column=1)

    # Apply styles
    for row in ws.iter_rows():
        for cell in row:
            cell.style = style_basic

        if row[0].value is not None and row[1].value is None:
            row[0].style = style_bold

    for i in range(2, 33):
        ws.column_dimensions[get_column_letter(i + 1)].width = 6

    wb.save(OUTPUT_FILE)
Example #55
def on_button_clicked(b):

    clear_output()
    display(button)
## UPLOADED INITIAL DATA
#     if mv.value !='':
#         try:

    data = pd.read_excel('story_'+ story.value+'/story'+ story.value+'.xlsx', sheet_name='sample')
    # data=data.drop(['FC_D','FC_E','FC_F'],axis=1)
    data['Departure_Date']=pd.to_datetime(data.Departure_Date)
    datadone=data[data.Departure_Date< pd.Timestamp(nowaday.value)]
    data=data[data.Departure_Date>= pd.Timestamp(nowaday.value)]
    ## 1. Sort by Arrival Date and Priority
    data=data.sort_values(by=['Arrival_Date','Price'], ascending=[True,False])

    ## 2. Set parameters and constraints
    # Total number of floating cranes
    a=1
    b=1
    c=1
    totfc = int(fcnumber.value)
    fclist=['FC_A','FC_B','FC_C']

    #### Create feature demanddays for 1 floating crane
    data['demanddays']= np.round(data.Demand_Qty/data.Loading_Rate)
    data['demandfc']=np.ceil(data['demanddays']/data.Laytime_Duration)
    data['demanddays_new']=np.ceil(data.Demand_Qty/(data.Loading_Rate*data['demandfc']))
    ## 3. Assign Floating Crane - Initial Plan
    # to get initial plan

    ### create initial first row
    import itertools
    a=[]
    for L in range(1, len(fclist)+1):
        for subset in itertools.combinations(fclist, L):
    #         print(subset)
            x=list(subset)
            a.append(x)
    a=[[1,0,0],
     [0,1,0],
     [0,0,1],
     [1,1,0],
     [1,0,1],
     [0,1,1],
    ]
    a=pd.DataFrame(a,columns=['FC_A', 'FC_B', 'FC_C'])

    if data.loc[0, 'demandfc'] == 1:
        data.loc[0, 'FC_A'] = 1
        data.loc[0, 'FC_B'] = 0
        data.loc[0, 'FC_C'] = 0
    elif data.loc[0, 'demandfc'] == 2:
        data.loc[0, 'FC_A'] = 1
        data.loc[0, 'FC_B'] = 1
        data.loc[0, 'FC_C'] = 0
    else:
        data.loc[0, 'FC_A'] = 1
        data.loc[0, 'FC_B'] = 1
        data.loc[0, 'FC_C'] = 1

    ### complete initial plan
    for i in range(1,data.shape[0]):

        if (data.loc[i,'demandfc'] == 1):
            data.loc[i, 'FC_A'] = 1
            data.loc[i, 'FC_B'] = 0
            data.loc[i, 'FC_C'] = 0
        elif (data.loc[i,'demandfc'] == 2) :

            for fc in range(a.shape[0]):
                if ((data.loc[i-1,'FC_A'])== (a.loc[fc,'FC_A'])) & ((data.loc[i-1,'FC_B'])== (a.loc[fc,'FC_B'])) & ((data.loc[i-1,'FC_C'])== (a.loc[fc,'FC_C'])): 
                    data.loc[i, 'FC_A'] = np.abs((a.loc[fc,'FC_A'])-1)
                    data.loc[i, 'FC_B'] = np.abs((a.loc[fc,'FC_B'])-1)
                    data.loc[i, 'FC_C'] = np.abs((a.loc[fc,'FC_C'])-1)
                    if ((data.loc[i, 'FC_A'] + data.loc[i, 'FC_B'] +data.loc[i, 'FC_C'] )==1) & (data.loc[i, 'FC_A']==0):
                        data.loc[i, 'FC_A']=1
                    elif ((data.loc[i, 'FC_A'] + data.loc[i, 'FC_B'] +data.loc[i, 'FC_C'] )==1) & (data.loc[i, 'FC_B']==0):
                        data.loc[i, 'FC_B']=1
                    elif ((data.loc[i, 'FC_A'] + data.loc[i, 'FC_B'] +data.loc[i, 'FC_C'] )==1) & (data.loc[i, 'FC_C']==0):
                        data.loc[i, 'FC_C']=1
                    else:continue
                else:continue
        else:
            data.loc[i, 'FC_A'] = 1
            data.loc[i, 'FC_B'] = 1
            data.loc[i, 'FC_C'] = 1  
    ## 4. Recalculate Departure Date
    #     based on real demanddays_new

    data['Arrival_Date_change']=pd.to_datetime(np.nan)
    data['Departure_Date_change']=pd.to_datetime(np.nan)
    data['FC_gap_Date_change'] = pd.to_datetime(np.nan)


    data=data[['MV', 'ETA','Arrival_Date', 'Laytime_Duration', 'Departure_Date',
       'Demand_Qty', 'Loading_Rate', 'Price', 'Demurrage_Rate', 'demanddays',
       'demandfc', 'demanddays_new',
               'FC_A', 'FC_B', 'FC_C',
                   'FC_D','FC_E','FC_F',
       'Arrival_Date_change', 'Departure_Date_change']]
    datachange=pd.DataFrame([[mv.value,arvl.value]],columns=['MV','Arrival_Date_change_source'])
    datachange['Arrival_Date_change_source']=pd.to_datetime(datachange.Arrival_Date_change_source)
    data=pd.merge(data,datachange,how='left',on=['MV'])
    data['Arrival_Date']=pd.to_datetime(data.Arrival_Date)
    data['Departure_Date']=pd.to_datetime(data.Departure_Date)
    # data['Arrival_Date_change']=pd.to_datetime(data.Arrival_Date_change)
    data['Arrival_Date_change']=data['Arrival_Date_change_source']
    data['Est_Departure_Date_change']=data['Arrival_Date_change']+pd.to_timedelta(data['demanddays_new'], unit='D')
    data['Departure_Date_change']=data['Arrival_Date_change']+pd.to_timedelta(10, unit='D')

    # data['Departure_Date_change']=pd.to_datetime(data.Departure_Date_change)
    data.loc[data.Arrival_Date_change.isnull() ,'Arrival_Date_change']=data.loc[data.Arrival_Date_change.isnull()  ,'Arrival_Date']
    data['Est_Departure_Date_change']=data['Arrival_Date_change']+pd.to_timedelta(data['demanddays_new'], unit='D')# data.loc[0, 'Arrival_Date_change']=pd.to_datetime(data.loc[0,'Arrival_Date_change_source'])
    # data.loc[0,'Departure_Date_change']=data.loc[0,'Arrival_Date_change']+pd.to_timedelta(data.loc[0,'demanddays_new'], unit='D')
    data['Departure_Date_change']=data['Arrival_Date_change']+pd.to_timedelta(10, unit='D')

    data.drop('Arrival_Date_change_source',axis=1,inplace=True)
    x=datachange['MV'][0]

    ### 6. Check the next schedule in the sequence
    #     If the changed departure date clashes with the next vessel, FC_Start_Date_change
    #     must be adjusted and the potential demurrage cost checked

    ## Sort by Arrival Date Change and Priority (Price)
    data=data.sort_values(by=['Arrival_Date_change','Price'], ascending=[True,False])
    data=data.reset_index()
    data.drop('index',axis=1,inplace=True)
    data['FC_Start_Date_change']=data['Arrival_Date_change']
    data['FC_End_Date_change']=data['Est_Departure_Date_change']

#             data.loc[data.MV== x ,'FC_Start_Date_change']=data.loc[data.MV== x ,'Arrival_Date_change']
#             data.loc[data.MV== x ,'FC_End_Date_change']=data.loc[data.MV== x ,'Est_Departure_Date_change']
#             data.loc[(data.MV!= x) & (data.FC_Start_Date_change.isnull()),'FC_Start_Date_change']=data.loc[data.MV!= x ,'Arrival_Date_change']
#             data.loc[(data.MV!= x) & (data.FC_End_Date_change.isnull()),'FC_End_Date_change']=data.loc[data.MV!= x ,'Est_Departure_Date_change']


    # Calculate Demurage cost
    data.loc[0,'Demmurage_Day']=0
    data.loc[0,'Demmurage_Cost']=0

    ### Create Demmuragecost Simulation Function

    def sim_demuragecost(totfc,data):
        totfc=totfc
        for i in range(1,data.shape[0]):
            # if the previous vessel's estimated departure falls on or after this vessel's arrival
            if (data.loc[i-1,'Est_Departure_Date_change'] >= data.loc[i,'Arrival_Date_change']) :
                totfc=totfc-data.loc[i-1,'demanddays_new']
                #if available fc >= demand fc i
                if (totfc >= data.loc[i,'demanddays_new'] ):
                    data.loc[i,'FC_Start_Date_change'] = data.loc[i,'Arrival_Date_change']
                    data.loc[i,'FC_End_Date_change'] = data.loc[i,'Est_Departure_Date_change']
                    # Calculate Demurage cost
                    data.loc[i,'Demmurage_Day']=np.ceil((data.loc[i,'FC_End_Date_change'] - data.loc[i,'Departure_Date_change'])/np.timedelta64(1,'D'))
                    data.loc[i,'Demmurage_Cost']=data.loc[i,'Demurrage_Rate'] * data.loc[i,'Demmurage_Day']
#                     data.loc[i,'FC_gap_Date_change'] = data.loc[i,'Departure_Date_change'] + pd.to_timedelta(1, unit='D') 

#                             data.loc[i,'Demmurage_Day']=0 #will be no risk to get demurage day n cost
#                             data.loc[i,'Demmurage_Cost']=0
                #if available FC is less than the demanded FC for vessel i, but at least one FC is still free
                elif (totfc < data.loc[i,'demanddays_new']) & (totfc >0):
                    #let the available FC start operating on the arrival date
                    data.loc[i,'FC_Start_Date_change'] = data.loc[i,'Arrival_Date_change']
                    data.loc[i,'FC_Start_Date_change_2'] = data.loc[i-1,'FC_End_Date_change'] + pd.to_timedelta(1, unit='D') #startdate next fc
                    #cal the number of days that available FC can start
                    data.loc[i,'dayrun_progress']=np.ceil((data.loc[i,'FC_Start_Date_change_2'] - data.loc[i,'FC_Start_Date_change'])/np.timedelta64(1,'D'))
                    #cal the remaining quantity that is already loaded by available FC
                    data.loc[i,'Demand_Qty_remain']= data.loc[i,'Demand_Qty'] - (data.loc[i,'Loading_Rate']*totfc*data.loc[i,'dayrun_progress'])
                    #cal the remaining number of FC to fulfill the demand
                    data.loc[i,'demandfc_remain'] = data.loc[i,'demanddays_new'] - totfc
                    #re-cal the total demandays based on this condition
                    data.loc[i,'demanddays_new']= np.ceil(data.loc[i,'Demand_Qty_remain'] / (data.loc[i,'Loading_Rate']*data.loc[i,'demanddays_new'])) +data.loc[i,'dayrun_progress']
                    #cal the end date fc operate
                    data.loc[i,'FC_End_Date_change'] = data.loc[i,'FC_Start_Date_change'] + pd.to_timedelta(data.loc[i,'demanddays_new'], unit='D')
                    # Calculate Demurage cost
                    data.loc[i,'Demmurage_Day']=np.ceil((data.loc[i,'FC_End_Date_change'] - data.loc[i,'Departure_Date_change'])/np.timedelta64(1,'D'))
                    data.loc[i,'Demmurage_Cost']=data.loc[i,'Demurrage_Rate'] * data.loc[i,'Demmurage_Day']
#                     data.loc[i,'FC_gap_Date_change'] = data.loc[i,'Departure_Date_change'] + pd.to_timedelta(1, unit='D') 

                #if no FC is available at all
                else:
                    #the FC can only start once the previous MV has finished loading
                    data.loc[i,'FC_Start_Date_change'] = data.loc[i-1,'FC_End_Date_change'] + pd.to_timedelta(1, unit='D') 
                    data.loc[i,'FC_End_Date_change'] = data.loc[i,'FC_Start_Date_change'] + pd.to_timedelta(data.loc[i,'demanddays_new'], unit='D')
                    # Calculate Demurage cost
                    data.loc[i,'Demmurage_Day']=np.ceil((data.loc[i,'FC_End_Date_change'] - data.loc[i,'Departure_Date_change'])/np.timedelta64(1,'D'))
                    data.loc[i,'Demmurage_Cost']=data.loc[i,'Demurrage_Rate'] * data.loc[i,'Demmurage_Day']
#                     data.loc[i,'FC_gap_Date_change'] = data.loc[i,'Departure_Date_change'] + pd.to_timedelta(1, unit='D') 
                totfc=3 #reset to initial total fc
            else:
                totfc = 3
                data.loc[i,'FC_Start_Date_change'] = data.loc[i,'Arrival_Date_change']
                data.loc[i,'FC_End_Date_change'] = data.loc[i,'Est_Departure_Date_change']
                data.loc[i,'Demmurage_Day']=np.ceil((data.loc[i,'FC_End_Date_change'] - data.loc[i,'Departure_Date_change'])/np.timedelta64(1,'D'))
                data.loc[i,'Demmurage_Cost']=data.loc[i,'Demurrage_Rate'] * data.loc[i,'Demmurage_Day']
#                 data.loc[i,'FC_gap_Date_change'] = data.loc[i,'Departure_Date_change'] + pd.to_timedelta(1, unit='D') 

        data.loc[data.Demmurage_Day<=0 ,'Demmurage_Day']=0
        data.loc[data.Demmurage_Cost<=0 ,'Demmurage_Cost']=0
#         data.loc[data.Demmurage_Cost<=0 ,'FC_gap_Date_change']=data.loc[data.Demmurage_Cost<=0 ,'FC_End_Date_change']
        return data
    ### Call function
    data=sim_demuragecost(totfc,data)
    data

    def gantt_fig3(data):
        data3 = []
        for row in data.itertuples():
            data3.append(dict(Task=str(row.MV), Start=str(row.Arrival_Date_change),
                          Finish=str(row.Departure_Date_change), Resource='Plan'))
            data3.append(dict(Task=str(row.MV), Start=str(row.FC_Start_Date_change),
                          Finish=str(row.FC_End_Date_change), Resource='Actual'))


        fig = ff.create_gantt(data3, index_col='Resource', title='Gantt Chart', show_colorbar = True, group_tasks = True , height=500, width=1300 )
        fig['layout'].update(legend=dict(traceorder='reversed'))
        return fig

    iplot(gantt_fig3(data))
    data = pd.concat([datadone, data])
    newtable=data
    posttable=data
    newtable.columns
    newtable['Arrival_Date']=newtable.Arrival_Date_change
    newtable['Departure_Date']=newtable.Departure_Date_change
    tab=newtable[['MV', 'ETA', 'Arrival_Date', 'Laytime_Duration', 'Departure_Date',
       'Demand_Qty', 'Loading_Rate', 'Price',
                  'FC_A', 'FC_B', 'FC_C', 'FC_D','FC_E', 'FC_F', 
                  'Demmurage_Day', 'Demurrage_Rate', 'Demmurage_Cost']]
    tab.to_excel('story_'+ story.value+'/story'+ story.value+'.xlsx',sheet_name='sample',engine='xlsxwriter',index=False)

    data.drop(['demanddays'],axis=1,inplace=True)
    data.rename(columns={'demanddays_new':'demanddays'},inplace=True)
    print('Total demurrage cost: USD ' + str(data.Demmurage_Cost.sum()))
    button
    data.dropna(axis=0, how='all', thresh=None, subset=None, inplace=True)
    data=data.drop(['FC_A', 'FC_B', 'FC_C', 'FC_D','FC_E', 'FC_F'],axis=1)
    data

    return button, display(data), data
Example #56
for data_file in data_files:
    if ".zip" in data_file:
        data_file_name = data_file[:-4]

        if extract_zip_files:
            with ZipFile("%s/%s" % (DATA_DIR, data_file), 'r') as zipObj:
                listOfFileNames = zipObj.namelist()
                fileName = listOfFileNames[0]
                zipObj.extractall("/tmp")
                os.replace("/tmp/%s" % fileName,
                           "/tmp/%s.xls" % data_file_name)

        xl = pd.ExcelFile("/tmp/%s.xls" % data_file_name)
        sheet_name = xl.sheet_names[0]
        df = pd.read_excel(xl,
                           sheet_name,
                           usecols=[NAME, DATE, INTEREST, LONG, SHORT])

        name_list += list(df[NAME])
        date_list += list(df[DATE])
        interest_list += list(df[INTEREST])
        long_list += list(df[LONG])
        short_list += list(df[SHORT])

num_of_entries = len(name_list)

z_scores_one_year = []
z_scores_three_year = []

cwd = os.getcwd()
Example #57
import requests
import csv
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

'''
Åldersgrupp       : Age group
Antal vaccinerade : Number of vaccinated
Andel vaccinerade : Proportion of vaccinated
Dosnummer         : Dose number
'''

xls = pd.ExcelFile("https://fohm.maps.arcgis.com/sharing/rest/content/items/fc749115877443d29c2a49ea9eca77e9/data")

xls1 = pd.read_excel(xls, 'Vaccinerade ålder')

#print(xls1.columns)

# Let's drop unnamed column from the dataframe
df = xls1.drop("Unnamed: 5", axis=1)
#print(df.columns)

# Now separate the dose 1 and dose 2 figures for the whole of Sweden
df_sweden_dose1 = df.loc[0:8]
print(df_sweden_dose1)

df_sweden_dose2 = df.loc[9:17]
#print(df_sweden_dose2)
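# The matplotlib/seaborn imports above are unused in this excerpt. A possible
# continuation, assuming the column names follow the glossary at the top of the file:
sns.barplot(x="Åldersgrupp", y="Andel vaccinerade", data=df_sweden_dose1)
plt.title("Proportion vaccinated (dose 1) by age group")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()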

Example #58
import pandas

df = pandas.read_excel('712693030RPKUP4RX.xlsx')
header = df.iloc[2]  # grab the header row
df1 = df[3:].copy()  # drop the first three rows
df1 = df1.rename(columns=header)  # reset the column headers
df2 = df1.drop(columns=['縣市代碼', '村里代碼', '村里名稱', '村里代碼'], axis=1)  # drop these four columns
df3 = df2.drop_duplicates()  # remove duplicate rows

df3.to_csv('district.csv', encoding='big5', index=False)
Example #59
import pickle
import pandas as pd
from ics_data_clean import clean_data, clean_text_round1, process_data
from ics_train_classifier import train_
from plotly.graph_objs import Bar, Scatter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# pickle load important files
category_id_df = pd.read_pickle('../pickle/ics/factorize/category_id_df.pkl')
id_to_category = dict(category_id_df[['category_id', 'category']].values)

with open('../pickle/ics/stop_words.pickle', 'rb') as f:
    stop_words = pickle.load(f)
# training_database_table = pd.read_pickle('../pickle/ics/training_database_index.pkl')

# process list
process_list = pd.read_excel('../data/ics/processes.xlsx')

# load training data
df = pd.read_pickle('../pickle/ics/data_final.pkl')

# define vectorizers
tfidf = TfidfVectorizer(analyzer='word',
                        sublinear_tf=True,
                        norm='l2',
                        encoding='latin-1',
                        ngram_range=(1, 2),
                        stop_words=stop_words)
tfidf.fit(df.content)
cv = CountVectorizer(stop_words=stop_words)

tfidf_s = TfidfVectorizer(analyzer='word',
import pandas as pd
import numpy as np
import sys

# read the data from the Excel file
df=pd.read_excel('E:/PythonStudy_Git/调用资料/file/菜品报表 (1).xlsx',sheet_name = 0)
# add a date column
df['时间'] = '2020-10-02'

df2=pd.read_excel('E:/PythonStudy_Git/调用资料/file/菜品报表 (2).xlsx',sheet_name = 0)
print(df2)

'''Merge the two tables. how: join type:
inner  - rows are combined only when the keys match (e.g. the same id)
left   - left join
right  - right join
outer  - full outer join
The default is inner.'''
df3=pd.merge(df,df2,how="outer")
print(df3)
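# A small illustration (not in the original) of the 'how' options described above,
# using the same two frames: inner keeps only rows whose key columns match in both
# reports, left keeps every row of df, outer keeps everything.
df_inner = pd.merge(df, df2, how="inner")
df_left = pd.merge(df, df2, how="left")
print(len(df_inner), len(df_left), len(df3))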