Code example #1
File: titanic.py  Project: maciejjaskowski/titanic
def submission(df, df_test, test_passenger_ids):
  output, forest = train_decision_tree(df[0::,1::], df[0::,0], df_test)
  csv = pd.concat([pd.DataFrame(test_passenger_ids), pd.DataFrame({'Survived': output.astype(int)})], axis=1)

  print(forest.feature_importances_)

  csv.to_csv("submission.csv", index=False)
Code example #2
def removestop(csvpath, stop):
    csv = pd.read_csv(csvpath)
    mes = csv['message']
    cl = csv['Coding:Level1']
    cidx = []
    iidx = []
    aidx = []
    for i in range(len(mes)):
        # for i in range(1):
        if not isinstance(mes[i], str):
            if math.isnan(mes[i]):
                mes[i] = ""
        mes[i] = mes[i].strip()  # str.strip() is not in-place; keep the result
        m = nl.tokenize.word_tokenize(mes[i])
        sentence = []
        for w in m:
            w = w.lower()
            if w not in stop:
                sentence.append(w)
        mes[i] = sentence
        if cl[i] == 'Information':
            iidx.append(i)
        elif cl[i] == 'Community':
            cidx.append(i)
        elif cl[i] == 'Action':
            aidx.append(i)
        # print(sentence)

    csv['message'] = mes
    csv.to_csv("data/stopsremoved.csv", index=False)

    return mes, cl, cidx, iidx, aidx
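Rewriting a Series element-by-element as above works but is slow and risks pandas chained-assignment warnings; the same stopword pass can be vectorized. A minimal sketch, assuming `stop` is an iterable of lowercase stopwords and nltk's tokenizer data is installed:

import nltk
import pandas as pd

def removestop_vectorized(csvpath, stop):
    stopset = set(stop)
    df = pd.read_csv(csvpath)
    # NaN messages become empty strings, then tokenize and filter in one pass
    df['message'] = (df['message'].fillna('')
                     .map(nltk.tokenize.word_tokenize)
                     .map(lambda ws: [w.lower() for w in ws if w.lower() not in stopset]))
    df.to_csv("data/stopsremoved.csv", index=False)
    return df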
Code example #3
def add_value_averages(length, mapped_dict, mfd):
    if 'CSVs' in os.listdir(os.getcwd()):
        os.chdir('CSVs')

    for file in os.listdir(os.getcwd()):
        if file.startswith(length):
            csv = pd.read_csv(file)

    for key, value in mapped_dict.items():
        word_list = []
        for word in value:
            if word in mfd:
                word = word.strip()  # str.strip() is not in-place; keep the result
                all_word = word + " (All)"
                if all_word in csv.columns.values:
                    word_list.append(all_word)
                elif word in csv.columns.values:
                    word_list.append(word)
        if word_list:
            csv[key] = csv[word_list].mean(axis=1)
            print(key)
            print(csv[key])

    file_name = length + "-MFD-with-averages.csv"
    csv.to_csv(file_name, index=False)
Code example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("path", type=str, help="Dataset path (*.csv)")
    parser.add_argument("destino_path", type=str, help="Path to save the file (*.csv)")
    args = parser.parse_args()
    
    path = args.path
    destino_path = args.destino_path
    #read csv  
    csv = pd.read_csv(path)
    #csv = clean_csv(csv).clean(600,'not', 'not', confi = None)
    #csv = clean_csv(csv).clean(600,'yes', 'not', confi = None)
    csv = clean_csv(csv).clean(700, 'yes', 'yes', 3, confi=0.65)
    csv = csv.reset_index(drop=True)
    
    #preprocessing each list
    p1 = Preprocessing()
    lista_query = p1.preprocess_text(csv['query'].tolist())
    lista_response = p1.preprocess_text([str(i) for i in csv['response'].tolist()])
    
    #delete query and response columns
    csv.drop(['query', 'response'], axis=1, inplace=True)
    #add query and response columns
    csv['query'] = lista_query
    csv['response'] = lista_response
    csv = csv.reset_index(drop=True)
    #save csv file
    csv.to_csv(destino_path + 'dataset_confidence_065__3.csv', index=False)
Code example #5
def saveCSV(self, filename, directory):
    os.chdir(directory)
    csv = self.csv[0]
    csv.to_csv(
        filename,
        encoding='utf-8',
    )
Code example #6
def calculate_rating(line):
    csv = pd.read_csv('ratings.csv', sep=',')
    row = csv.loc[line]
    print(row)
    curr_rating = (float(row[2]) * 5 + float(row[3]) * 4 + float(row[4]) * 3 +
                   float(row[5]) * 2 + float(row[6]) * 1) / float(row[1])
    print(round(curr_rating, 2))
    # Persist the rating before returning, so the write is actually reached
    csv.loc[line, 'Rating'] = curr_rating
    csv.to_csv("ratings.csv", index=False)
    return round(curr_rating, 2)
Code example #7
def removelinks(csvpath):
    csv = pd.read_csv(csvpath)

    mes = csv['message']
    for i in range(len(mes)):
        if "http" in mes[i] or "www" in mes[i]:
            mes[i] = regex.sub("http\S+", "", mes[i])
            mes[i] = regex.sub("www\S+", "", mes[i])

    csv['message'] = mes
    csv.to_csv("data/linksremoved.csv", index=False)
Code example #8
def main():
    query = input('Introduce query: ')
    num_items = check_query(query)
    action = input(
        f'The query returned {num_items} results. Do you want to continue? [y/n] '
    )
    if 'y' in action or 'Y' in action:
        csv = get_csv(num_items, query)
        ts = int(time.time())
        csv.to_csv(f'sample_data/{ts}.csv')
        print(f'Data saved to {ts}.csv')
    else:
        print('Aborting...')
Code example #9
def process_csv(filename, outfile):
    csv = pd.read_csv(filename, names=['timeStamp', 'rawData'])
    csv.drop(csv.index[0], inplace=True)
    csv.reset_index(inplace=True)

    csv.drop(['index'], axis=1, inplace=True)
    csv['timeStamp'] -= csv['timeStamp'][0]
    csv['rawData'] = csv['rawData'] - float(csv['rawData'].mode()[0])  # mode() returns a Series; take its first value

    ## Add endTimeStamp ##
    # tmp = csv['timeStamp'].shift(-1).fillna(0)
    # csv['endTimeStamp'] = tmp
    # csv = csv[:-1]
    # csv['offset'] = csv['endTimeStamp'] - csv['timeStamp']
    csv.to_csv(outfile, sep=',', index=False)
Code example #10
File: main.py  Project: inap-bannai/geolocation
def writeLatLngToCSV(files: dict):
    # Output one file per date
    # Create the file if it does not exist yet
    # My Maps plots by lat/lng, so each row holds those plus metadata such as title and date
    outputFileName = './output/{0}.csv'.format(date.today())
    if not os.path.exists(outputFileName):
        touch(outputFileName)

    csv = pd.read_csv(outputFileName, sep='\t', names=header)

    for _, locations in files.items():
        for location in locations:
            csv = pd.concat(
                [csv, pd.DataFrame([location], columns=header)],
                ignore_index=True)
    # Write back with the same tab separator used on the read side
    csv.to_csv(outputFileName, sep='\t', encoding='utf-8', index=False)
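Re-reading and rewriting the whole file on every call gets slower as the daily file grows; pandas can append instead. A sketch with the same tab separator, assuming `header` is the column list used above:

import os
import pandas as pd

write_header = not os.path.exists(outputFileName)
rows = [loc for locations in files.values() for loc in locations]
# mode='a' appends; the header row is written only when the file is new
pd.DataFrame(rows, columns=header).to_csv(
    outputFileName, sep='\t', mode='a', encoding='utf-8',
    index=False, header=write_header)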
Code example #11
File: datasort.py  Project: ap9430/nltk
def linkclean(csvpath):
    features = []
    csv = pd.read_csv(csvpath)

    mes = csv['message']
    for i in range(len(mes)):
        if "http" in mes[i] or "www" in mes[i]:
            mes[i] = regex.sub("http\S+", "", mes[i])
            mes[i] = regex.sub("www\S+", "", mes[i])
            features.append(1)
        else:
            features.append(0)

    csv['message'] = mes
    csv.to_csv("..\data\linksremoved.csv", index=False)

    return features
Code example #12
def get_by_salary(min, max):
    uri = "https://ciabhackathon.conductor.com.br:8443/transacoes/data/intervalo/2018-05-01/2018-07-01"
    resp = requests.get(uri,
                        headers={
                            'Content-Type':
                            'application/json',
                            'Authorization':
                            'Token a754f3d7bffaf8abc2570d5c354f8f4015e5487e'
                        })

    if resp.status_code != 200:
        print("Deu errado")
    else:
        result = resp.json()['data']
        csv = DataFrame(result)
        csv.to_csv(r'file.csv')  # to_csv returns None when given a path; there is no value to keep
Code example #13
def WriteToCSV(csv, row, column, value):
    csv.at[row, column] = value
    csv.to_csv(witchFile + '.csv', index=False)
Code example #14
File: cluster.py  Project: aBITnav/Scheduling
import json
import csv
import pandas as pd

with open('clustered_pools.json') as f:
    clust = json.loads(f.read())
with open('column.json') as f:
    ids = json.loads(f.read())
with open('latlon.json') as f:
    latlon = json.loads(f.read())

if __name__ == '__main__':
    data = [[0] * 5 for i in range(len(ids))]
    k = 0
    for i in range(len(clust)):
        for j in clust[i]:
            data[k][0] = ids[j]
            data[k][1] = j
            data[k][2] = latlon[j][0]
            data[k][3] = latlon[j][1]
            data[k][4] = i + 1
            k = k + 1
    csv = pd.DataFrame(data,
                       columns=['Employee', 'Sl.No', 'Lat', 'Lon', 'Pool'])
    '''with open("clust_analysis.csv", "w") as f:
        writer = csv.writer(f)
        writer.writerows(csv)'''
    csv.to_csv('clust_lsis.csv')
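Note the shadowing above: `import csv` is immediately rebound to the DataFrame, which is why the commented-out `csv.writer` lines could never coexist with it. Aliasing the stdlib module keeps both usable; a minimal sketch:

import csv as csvmod
import pandas as pd

df = pd.DataFrame(data, columns=['Employee', 'Sl.No', 'Lat', 'Lon', 'Pool'])
df.to_csv('clust_lsis.csv', index=False)
# The stdlib writer remains reachable under its alias if ever needed
with open('clust_analysis.csv', 'w', newline='') as f:
    csvmod.writer(f).writerows(df.values.tolist())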
Code example #15
File: main.py  Project: AntoineAugusti/youtubers-tips
        "slug":
        current["slug"],
        "name":
        current["name"],
        "creator_pseudo":
        current["creator"]["pseudo"],
        "categories":
        "|".join([e["name"] for e in current["categories"]]),
        "youtube_url":
        extract_url(current["links"], "youtube"),
        "twitter_url":
        extract_url(current["links"], "twitter"),
        "tip_amount":
        int(current["parameters"]["tipperAmount"]),
        "tip_number":
        int(current["parameters"]["tipperNumber"]),
    })

with open(FILENAME, "a") as f:
    writer = csv.DictWriter(f, data[0].keys(), lineterminator="\n")
    if f.tell() == 0:
        writer.writeheader()
    writer.writerows(data)

csv = pd.read_csv(FILENAME, parse_dates=["date"])

csv.drop_duplicates(subset=["date", "slug"], keep="last", inplace=True)
csv.sort_values(by=["date", "slug"], inplace=True)

csv.to_csv(FILENAME, index=False)
Code example #16
    except:
        continue
        '''
        If something goes wrong or the page reaches
        the end, the file is written
        '''
    print(endereco)
    print('PAGINA: ', pagina)

    pagina -= 1

    if pagina <= 90:
        break

#listass.to_csv('lista.csv')
'''
csv = pd.DataFrame()

csv['titulo'] = lista_titulo
csv['corpo'] = lista_corpo
csv['url'] = lista_url

csv.to_csv('el_pais_full.csv')



'''
s1 = pd.Series(lista_titulo, name='titulo')
s2 = pd.Series(lista_corpo, name='corpo')
s3 = pd.Series(lista_url, name='url')
Code example #17
def save_results(scores,
                 true_labels,
                 model,
                 dataset,
                 method,
                 weight,
                 label,
                 random_seed,
                 anomaly_type,
                 anomaly_proportion,
                 step=-1):

    directory = 'results/{}/{}/{}_{}/{}/w{}/'.format(model, dataset,
                                                     anomaly_type,
                                                     anomaly_proportion,
                                                     method, weight)
    if not os.path.exists(directory):
        os.makedirs(directory)
    print(directory, dataset)
    if dataset != 'kdd':

        print("Tets on ", dataset)
        file_name = str(label) + "_step" + str(step)
        if anomaly_type == 'novelty':
            print("NOVELTY")
            c = 90
            if dataset == 'rop':
                c = 22
        else:
            c = anomaly_proportion * 100

        file_name = "{}_step{}_rd{}".format(label, step, random_seed)
        c = anomaly_proportion * 100

        # The highest c% of scores are treated as anomalous
        per = np.percentile(scores, 100 - c)
        fname = directory + "{}.csv".format(label)
        csv_file = directory + "scores.csv"
    else:
        file_name = "kdd_step{}_rd{}".format(step, random_seed)
        # Highest 20% are anomalous
        per = np.percentile(scores, 80)
        fname = directory + "results.csv"
        csv_file = directory + "scores.csv"

    scores = np.array(scores)

    csv = pd.DataFrame()
    csv['scores'] = scores
    csv['labels'] = true_labels
    csv.to_csv(csv_file, index=False)

    #try:
    #    scores_norm = (scores-min(scores))/(max(scores)-min(scores))
    #except:
    #    scores_norm = (scores-scores.min())/(scores.max()-scores.min())

    print(max(scores), min(scores))
    roc_auc = do_roc(scores,
                     true_labels,
                     file_name=file_name,
                     directory=directory)
    prc_auc = do_prc(scores,
                     true_labels,
                     file_name=file_name,
                     directory=directory)
    do_cumdist(scores, file_name=file_name, directory=directory)

    prg_auc = 0  #do_prg(scores, true_labels, file_name=file_name, directory=directory)
    '''
    plt.close()

    plt.figure()
    idx_inliers = true_labels == 0
    idx_outliers = true_labels == 1
    hrange = (min(scores), max(scores))
    plt.hist(scores[idx_inliers], 50, facecolor=(0, 1, 0, 0.5),
             label="Normal samples", density=True, range=hrange)
    plt.hist(scores[idx_outliers], 50, facecolor=(1, 0, 0, 0.5),
             label="Anomalous samples", density=True, range=hrange)
    plt.title("Distribution of the anomaly score")
    plt.legend()
    plt.savefig(directory + 'histogram_{}_{}.png'.format(random_seed, dataset),
                transparent=True, bbox_inches='tight')
    '''

    y_pred = (scores >= per)

    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels.astype(int), y_pred.astype(int), average='binary')

    print(
        "Testing at step %i, method %s: Prec = %.4f | Rec = %.4f | F1 = %.4f" %
        (step, method, precision, recall, f1))

    print(
        "Testing method {} | ROC AUC = {:.4f} | PRC AUC = {:.4f} | PRG AUC = {:.4f}"
        .format(method, roc_auc, prc_auc, prg_auc))

    results = [
        model, dataset, anomaly_type, anomaly_proportion, method, weight,
        label, step, roc_auc, prc_auc, prg_auc, precision, recall, f1,
        random_seed,
        time.ctime()
    ]
    save_results_csv("results/results.csv", results, header=0)

    results = [step, roc_auc, prc_auc, precision, recall, f1, random_seed]
    save_results_csv(fname, results, header=0)
Code example #18
File: final_split.py  Project: aBITnav/Scheduling
import json
import pandas as pd

with open('split_pools.json') as f:
    clust = json.loads(f.read())
with open('column.json') as f:
    ids = json.loads(f.read())
with open('latlon.json') as f:
    latlon = json.loads(f.read())

if __name__ == '__main__':
    data = [[0] * 6 for i in range(len(ids))]
    k = 0
    for i in range(len(clust)):
        c = 1
        for j in clust[i]:
            data[k][0] = ids[j]
            data[k][1] = j
            data[k][2] = latlon[j][0]
            data[k][3] = latlon[j][1]
            data[k][4] = i + 1
            data[k][5] = c
            c = c + 1
            k = k + 1
    csv = pd.DataFrame(data,
                       columns=['Employee', 'Sl.No', 'Lat', 'Lon', 'Pool', 'Order'])
    '''with open("clust_analysis.csv", "w") as f:
        writer = csv.writer(f)
        writer.writerows(csv)'''
    csv.to_csv('split_lsis.csv')
Code example #19
File: combos.py  Project: kevinhu/shop-and-ship
iteration = 1  # avoid shadowing the built-in iter()

for i in range(sample_n + 1):
    random_samples.append(random.sample(regions_list, choose))

for i in random_samples:
    print(str(iter) + " / " + str(sample_n))
    #all_or = regions_dict[i[0]] | regions_dict[i[1]] | regions_dict[i[2]] | regions_dict[i[3]] | regions_dict[i[4]]
    all_or = regions_dict[i[0]]
    for j in range(1, choose):
        all_or = all_or | regions_dict[i[j]]
    combos["_".join(str(x) for x in list(i))] = all_or.count("1")
    iteration = iteration + 1

sorted_combos = sorted(combos.items(),
                       key=operator.itemgetter(1),
                       reverse=True)
# print(dict(sorted_combos))

csv = pd.DataFrame(index=range(sample_n + 1), columns=["regions", "coverage"])
csv["regions"] = list(dict(sorted_combos).keys())
csv["coverage"] = list(dict(sorted_combos).values())
# print(csv)
plt.figure(figsize=(10, 5))
sns.distplot(csv["coverage"], bins=100, rug=False)
plt.savefig("coverage_figures/t" + str(top_num) + "c" + str(choose) + "n" +
            str(sample_n) + ".png")
csv.to_csv("coverage_data/t" + str(top_num) + "c" + str(choose) + "n" +
           str(sample_n) + ".csv",
           sep=',')
Code example #20
def addLink():
    csv = pd.read_csv('../result/info.csv')
    print(len(csv))
    csv['link'] = csv.apply(lambda x: getLinkNumber(x['idx']), axis=1)
    csv.to_csv('../result/info1.csv', index=False)
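`apply` with axis=1 materializes a full row object per record just to read one field; mapping the column directly is equivalent here and faster. A minimal sketch:

csv['link'] = csv['idx'].map(getLinkNumber)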
Code example #21
File: FSGLM.py  Project: AndrewJSchoen/FSGLM
def writeCSV(csv, csvfilepath):
  csv.to_csv(csvfilepath)
Code example #22
def delIndex(id):
    csv = pd.read_csv('../result/idcsv/%d.csv' % id, index_col=0)
    csv.to_csv('../result/idcsv/%d.csv' % id, index=False)
Code example #23
File: run.py  Project: GielDerks/case_day_dash_app
                                  ' ',
                                  '', ' ', ' ', ' ', ' ', ' ',
                                  ' ',
                                  ' ', ' ', '', ' ',
                                  ' '])
            if count == g + 25:
                break

            count += 1
            pass

csv_file.close()
csv_file4.close()


csv = pd.read_csv(start_from_name, delimiter=';')

# csv.columns = ['BEDRIJF_INPUT', 'VESTIGING', 'bagId', 'city', 'country', 'gpslat',
#                'gpslon', 'housenumber', 'houseNumberAddition','postalCode', 'rijksdriehoekX', 'rijksdriehoekY', 'rijksdriehoekZ',
#                'street', 'type', 'branchNumber', 'isMainSbi', 'sbiCode', 'sbiCodeDescription', 'employees',
#                'foundationDate', 'hasCommercialActivities', 'hasEntryInBusinessRegister', 'hasNonMailingIndication',
#                'isBranch', 'isLegalPerson', 'isMainBranch', 'kvkNumber', 'legalForm', 'registrationDate', 'rsin',
#                'businessName', 'currentStatutoryNames', 'currentTradeNames', 'shortBusinessName', 'fuzzy_match_score',
#                'matched_company_name']

start_from_name = '/Users/gielderks/Downloads/Code/Final_excel/profilersinput/ouput_' + start_from + '_till_' + str(count) + '.csv'
csv.to_csv(start_from_name, index=False)


# pprint.pprint(companies)
Code example #24
def urlID(id):
    global db
    csv = pd.read_csv('../result/csv/%d.csv' % id)
    csv['urlID'] = csv.apply(lambda x: getID(x.url), axis=1)
    csv.to_csv('../result/idcsv/%d.csv' % id)
Code example #25
File: task2.py  Project: nemo-tj/biendata
def sub_task2_file():
    validation = pd.read_csv(PM.task2_validation_csv, header=0, index_col='author_name')
    txt = pd.read_csv(PM.task2_target, sep='\t', header=0, index_col='authorname')
    csv = txt.loc[validation.index].fillna(method='bfill')
    csv.index.name = 'authorname'
    csv.to_csv(PM.sub_task2, sep='\t', header=1, index=True)
Code example #26
File: Crawler.py  Project: Kaioiva/CrawMobi
    def menu(self):

        cel1 = Celular.celular()
        cel2 = Celular.celular()
        compara = Comparador.comparador()

        # Create the xlsx file that will hold the handset data
        workbook = xlsxwriter.Workbook('Dados_Smartphones.xlsx')
        worksheet = workbook.add_worksheet()

        # Write the table header row
        worksheet.write(0, 0, 'Marca')
        worksheet.write(0, 1, 'Modelo')
        worksheet.write(0, 2, 'Capacidade da Bateria (mAh)')
        worksheet.write(0, 3, 'Memória RAM (GB)')
        worksheet.write(0, 4, 'Memória de Armazenamento (GB)')
        worksheet.write(0, 5, 'Bluetooth')
        worksheet.write(0, 6, 'NFC')
        worksheet.write(0, 7, 'Dual Chip')
        worksheet.write(0, 8, 'LTE (4G)')
        worksheet.write(0, 9, 'Resolução da Câmera (Mpx)')
        worksheet.write(0, 10, 'Peso (g)')
        worksheet.write(0, 11, 'Dimensões')
        worksheet.write(0, 12, 'Tamanho da Tela (")')
        worksheet.write(0, 13, 'Sistema Operacional')
        worksheet.write(0, 14, 'Versão SO')
        worksheet.write(0, 15, 'Processamento (GHz)')
        worksheet.write(0, 16, 'Link fonte')
        worksheet.write(0, 17, 'Data de atualização')
        worksheet.write(0, 18, 'Ano do lançamento')
        worksheet.write(0, 19, 'Preço (R$)')
        worksheet.write(0, 20, 'Avaliação do Site')
        worksheet.write(0, 21, 'Avaliação dos Usuários')

        # Create the application log file
        log = open('logs.txt', 'w')
        log.write('Smartphones Description - Getting Database\n\n')
        log.write('Execution logs:\n\n')
        log.close()

        book = xlrd.open_workbook("ListaSmartphones.xls")
        sh = book.sheet_by_index(0)
        lista = []

        for rx in range(sh.nrows):
            lista.append(sh.row(rx))
        else:
            for linha, value in enumerate(lista):
                #print(linha, value)

                try:
                    # Pull the device name from the smartphone list to query the sites
                    aparelho = str(lista[linha + 1]).split("'")[1].upper()

                    kim = Kimovil.kimovil(aparelho)

                    pha = PhoneArena.phoneArena(aparelho)

                    cel1 = kim.executa()
                    cel2 = pha.executa()

                    compara.armazena(cel1, cel2, linha, worksheet)
                except Exception:
                    print("NOT FOUND")

        log = open('logs.txt', 'a')
        log.write(
            'A maioria dos dados escolhidos foram do site Kimovil por possuir uma base de dados mais extensa.\n\n'
        )
        log.close()

        csvfile = "handsets.csv"
        f = open(
            csvfile,
            'wb')  # Abre o arquivo para escrita apagando o conteúdo existente
        csv = pd.read_excel('Dados_Smartphones.xlsx')
        csv.to_csv('handsets.csv', index=False)

        # Bundle the outputs into a zip archive
        with ZipFile('gettingDatabase.zip', 'w') as myzip:
            myzip.write('handsets.csv')
            myzip.write('logs.txt')
Code example #27
def sortSeg(id):
    print("sorting %d" % id)
    csv = pd.read_csv('../result/segg/seg%d.csv' % id)
    csv.sort_values('times', inplace=True, ascending=False)
    csv.to_csv("../result/seg/%d.csv" % id, index=False)
Code example #28
    except:

        continue

    print(endereco)
    print('PAGINA: ', pagina)
    
    pagina -= 1

    if pagina <= 0:
        break


csv = pd.DataFrame(lista)

csv.to_csv('lista.csv')
'''
csv = pd.DataFrame()

csv['titulo'] = lista_titulo
csv['corpo'] = lista_corpo
csv['url'] = lista_url

csv.to_csv('el_pais_full.csv')
'''
s1 = pd.Series(lista_titulo, name='titulo')
s2 = pd.Series(lista_corpo, name='corpo')
s3 = pd.Series(lista_url, name='url')

s1.to_csv('s1_internacional.csv')
s2.to_csv('s2_internacional.csv')