Example #1
def submission(df, df_test, test_passenger_ids):
  output, forest = train_decision_tree(df[0::,1::], df[0::,0], df_test)
  csv = pd.concat([pd.DataFrame(test_passenger_ids), pd.DataFrame({'Survived': output.astype(int)})], axis=1)

  print(forest.feature_importances_)

  csv.to_csv("submission.csv", index=False)
Example #2
def removestop(csvpath, stop):
    csv = pd.read_csv(csvpath)
    mes = csv['message']
    cl = csv['Coding:Level1']
    cidx = []
    iidx = []
    aidx = []
    for i in range(len(mes)):
        # Replace NaN (empty) cells with an empty string
        if not isinstance(mes[i], str):
            if math.isnan(mes[i]):
                mes[i] = ""
        mes[i] = mes[i].strip()
        m = nl.tokenize.word_tokenize(mes[i])
        sentence = []
        for w in m:
            w = w.lower()
            if w not in stop:
                sentence.append(w)
        mes[i] = sentence
        if cl[i] == 'Information':
            iidx.append(i)
        elif cl[i] == 'Community':
            cidx.append(i)
        elif cl[i] == 'Action':
            aidx.append(i)

    csv['message'] = mes
    csv.to_csv("data/stopsremoved.csv", index=False)

    return mes, cl, cidx, iidx, aidx
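A hedged usage sketch for removestop: the function relies on module-level imports of math, pandas, and nltk (as nl), and the stopword set here comes from NLTK's corpus; the input path is made up but the columns ('message', 'Coding:Level1') match what the function reads:

import math
import pandas as pd
import nltk as nl
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))
# 'data/coded.csv' is a hypothetical file with 'message' and 'Coding:Level1' columns
messages, labels, cidx, iidx, aidx = removestop('data/coded.csv', stop)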
Example #3
def add_value_averages(length, mapped_dict, mfd):
    if 'CSVs' in os.listdir(os.getcwd()):
        os.chdir('CSVs')

    # Load the CSV whose file name starts with the given length prefix
    for file in os.listdir(os.getcwd()):
        if file.startswith(length):
            csv = pd.read_csv(file)

    for key, value in mapped_dict.items():
        word_list = []
        for word in value:
            if word in mfd:
                word = word.strip()
                all_word = word + " (All)"
                if all_word in csv.columns.values:
                    word_list.append(all_word)
                elif word in csv.columns.values:
                    word_list.append(word)
        if word_list:
            csv[key] = csv[word_list].mean(axis=1)
            print(key)
            print(csv[key])

    file_name = length + "-MFD-with-averages.csv"
    csv.to_csv(file_name, index=False)
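A possible invocation of add_value_averages, assuming a CSVs directory containing a file whose name starts with the given prefix; the category and word names below are purely illustrative:

mapped = {'care.virtue': ['kindness', 'compassion'], 'harm.vice': ['hurt', 'suffer']}
mfd = set(w for words in mapped.values() for w in words)
add_value_averages('2019', mapped, mfd)  # averages matching columns of CSVs/2019*.csv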
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("path", type=str, help="Dataset path (*.csv)")
    parser.add_argument("destino_path", type=str, help="Path to save the file (*.csv)")
    args = parser.parse_args()
    
    path = args.path
    destino_path = args.destino_path
    # read csv
    csv = pd.read_csv(path)
    #csv = clean_csv(csv).clean(600, 'not', 'not', confi=None)
    #csv = clean_csv(csv).clean(600, 'yes', 'not', confi=None)
    csv = clean_csv(csv).clean(700, 'yes', 'yes', 3, confi=0.65)
    csv = csv.reset_index(drop=True)
    
    #preprocessing each list
    p1 = Preprocessing()
    lista_query = p1.preprocess_text(csv['query'].tolist())
    lista_response = p1.preprocess_text([str(i) for i in csv['response'].tolist()])
    
    #delete query and response columns
    csv.drop(['query', 'response'], axis=1, inplace=True)
    #add query and response columns
    csv['query'] = lista_query
    csv['response'] = lista_response
    csv = csv.reset_index(drop=True)
    #save csv file
    csv.to_csv(destino_path + 'dataset_confidence_065__3.csv', index=False)
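A hypothetical command line for the script above, assuming it is saved as preprocess.py and that clean_csv and Preprocessing are importable from the surrounding project:

python preprocess.py data/raw_dataset.csv output/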
Example #5
    def saveCSV(self, filename, directory):
        os.chdir(directory)
        csv = self.csv[0]
        csv.to_csv(
            filename,
            encoding='utf-8',
        )
Example #6
def calculate_rating(line):
    csv = pd.read_csv('ratings.csv', sep=',')
    row = csv.loc[line]
    print(row)
    # Weighted average of the vote columns (row[2]..row[6], weighted 5 down to 1),
    # normalized by the total vote count in row[1]
    curr_rating = round((float(row[2]) * 5 + float(row[3]) * 4 + float(row[4]) * 3 +
                         float(row[5]) * 2 + float(row[6]) * 1) / float(row[1]), 2)
    print(curr_rating)
    csv.loc[line, 'Rating'] = curr_rating
    csv.to_csv("ratings.csv", index=False)
    return curr_rating
Example #7
def removelinks(csvpath):
    csv = pd.read_csv(csvpath)

    mes = csv['message']
    for i in range(len(mes)):
        if "http" in mes[i] or "www" in mes[i]:
            mes[i] = regex.sub(r"http\S+", "", mes[i])
            mes[i] = regex.sub(r"www\S+", "", mes[i])

    csv['message'] = mes
    csv.to_csv("data/linksremoved.csv", index=False)
Example #8
def main():
    query = input('Introduce query: ')
    num_items = check_query(query)
    action = input(
        f'The query returned {num_items} results. Do you want to continue? [y/n] '
    )
    if 'y' in action or 'Y' in action:
        csv = get_csv(num_items, query)
        ts = int(time.time())
        csv.to_csv(f'sample_data/{ts}.csv')
        print(f'Data saved to {ts}.csv')
    else:
        print('Aborting...')
Example #9
def process_csv(filename, outfile):
    csv = pd.read_csv(filename, names=['timeStamp', 'rawData'])
    csv.drop(csv.index[0], inplace=True)
    csv.reset_index(inplace=True)

    csv.drop(['index'], axis=1, inplace=True)
    csv['timeStamp'] -= csv['timeStamp'][0]
    # Subtract the most common value as a baseline offset
    csv['rawData'] = csv['rawData'] - float(csv['rawData'].mode()[0])

    ## Add endTimeStamp ##
    # tmp = csv['timeStamp'].shift(-1).fillna(0)
    # csv['endTimeStamp'] = tmp
    # csv = csv[:-1]
    # csv['offset'] = csv['endTimeStamp'] - csv['timeStamp']
    csv.to_csv(outfile, sep=',', index=False)
Example #10
def writeLatLngToCSV(files: dict):
    # Output one file per date
    # Create the file if it does not exist yet
    # My Maps plots by lat/lng, so each row holds those plus the title, date, and other metadata
    outputFileName = './output/{0}.csv'.format(date.today())
    if not os.path.exists(outputFileName):
        touch(outputFileName)

    csv = pd.read_csv(outputFileName, sep='\t', names=header)

    for _, locations in files.items():
        for location in locations:
            csv = pd.concat(
                [csv, pd.DataFrame([location], columns=header)],
                ignore_index=True)
    csv.to_csv(outputFileName, encoding='utf-8')
Example #11
def linkclean(csvpath):
    features = []
    csv = pd.read_csv(csvpath)

    mes = csv['message']
    for i in range(len(mes)):
        if "http" in mes[i] or "www" in mes[i]:
            mes[i] = regex.sub("http\S+", "", mes[i])
            mes[i] = regex.sub("www\S+", "", mes[i])
            features.append(1)
        else:
            features.append(0)

    csv['message'] = mes
    csv.to_csv("..\data\linksremoved.csv", index=False)

    return features
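A quick demonstration of the link-stripping substitution used in both link functions, assuming regex is the standard library module bound via "import re as regex" (the sample text is made up):

import re as regex

text = "check http://example.com and www.example.org today"
text = regex.sub(r"http\S+", "", text)
text = regex.sub(r"www\S+", "", text)
print(text)  # -> "check  and  today"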
Example #12
def get_by_salary(min_salary, max_salary):
    uri = "https://ciabhackathon.conductor.com.br:8443/transacoes/data/intervalo/2018-05-01/2018-07-01"
    resp = requests.get(uri,
                        headers={
                            'Content-Type':
                            'application/json',
                            'Authorization':
                            'Token a754f3d7bffaf8abc2570d5c354f8f4015e5487e'
                        })

    if resp.status_code != 200:
        print("Request failed")
    else:
        result = resp.json()['data']
        csv = DataFrame(result)
        csv.to_csv(r'file.csv')
Example #13
def WriteToCSV(csv, row, column, value):
    csv.at[row, column] = value
    csv.to_csv(witchFile + '.csv', index=False)
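Hypothetical usage of WriteToCSV; witchFile is a module-level base filename the function expects to find, so it is defined here for illustration:

import pandas as pd

witchFile = 'grades'  # made-up global consumed by WriteToCSV
df = pd.DataFrame({'name': ['ann'], 'score': [70]})
WriteToCSV(df, 0, 'score', 95)  # updates the cell and rewrites grades.csv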
Example #14
import json
import csv
import pandas as pd

with open('clustered_pools.json') as f:
    clust = json.loads(f.read())
with open('column.json') as f:
    ids = json.loads(f.read())
with open('latlon.json') as f:
    latlon = json.loads(f.read())

if __name__ == '__main__':
    data = [[0] * 5 for i in range(len(ids))]
    k = 0
    for i in range(len(clust)):
        for j in clust[i]:
            data[k][0] = ids[j]
            data[k][1] = j
            data[k][2] = latlon[j][0]
            data[k][3] = latlon[j][1]
            data[k][4] = i + 1
            k = k + 1
    # Note: this DataFrame shadows the csv module imported above, so the
    # commented-out csv.writer block below would no longer work as written
    csv = pd.DataFrame(data,
                       columns=['Employee', 'Sl.No', 'Lat', 'Lon', 'Pool'])
    '''with open("clust_analysis.csv", "w") as f:
        writer = csv.writer(f)
        writer.writerows(csv)'''
    csv.to_csv('clust_lsis.csv')
Example #15
        "slug":
        current["slug"],
        "name":
        current["name"],
        "creator_pseudo":
        current["creator"]["pseudo"],
        "categories":
        "|".join([e["name"] for e in current["categories"]]),
        "youtube_url":
        extract_url(current["links"], "youtube"),
        "twitter_url":
        extract_url(current["links"], "twitter"),
        "tip_amount":
        int(current["parameters"]["tipperAmount"]),
        "tip_number":
        int(current["parameters"]["tipperNumber"]),
    })

with open(FILENAME, "a") as f:
    writer = csv.DictWriter(f, data[0].keys(), lineterminator="\n")
    if f.tell() == 0:
        writer.writeheader()
    writer.writerows(data)

csv = pd.read_csv(FILENAME, parse_dates=["date"])

csv.drop_duplicates(subset=["date", "slug"], keep="last", inplace=True)
csv.sort_values(by=["date", "slug"], inplace=True)

csv.to_csv(FILENAME, index=False)
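The drop_duplicates call keeps only the newest row per (date, slug) pair, so re-running the scraper updates entries rather than duplicating them; a tiny illustration with made-up values:

import pandas as pd

df = pd.DataFrame({'date': ['2021-01-01', '2021-01-01'],
                   'slug': ['a', 'a'],
                   'tip_amount': [1, 2]})
df.drop_duplicates(subset=['date', 'slug'], keep='last', inplace=True)
print(df)  # only the row with tip_amount == 2 survives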
Example #16
    except:
        continue
        '''
        If something goes wrong or the page reaches
        the end, the file is written
        '''
    print(endereco)
    print('PAGINA: ', pagina)

    pagina -= 1

    if pagina <= 90:
        break

#listass.to_csv('lista.csv')
'''
csv = pd.DataFrame()

csv['titulo'] = lista_titulo
csv['corpo'] = lista_corpo
csv['url'] = lista_url

csv.to_csv('el_pais_full.csv')



'''
s1 = pd.Series(lista_titulo, name='titulo')
s2 = pd.Series(lista_corpo, name='corpo')
s3 = pd.Series(lista_url, name='url')
Example #17
def save_results(scores,
                 true_labels,
                 model,
                 dataset,
                 method,
                 weight,
                 label,
                 random_seed,
                 anomaly_type,
                 anomaly_proportion,
                 step=-1):

    directory = 'results/{}/{}/{}_{}/{}/w{}/'.format(model, dataset,
                                                     anomaly_type,
                                                     anomaly_proportion,
                                                     method, weight)
    if not os.path.exists(directory):
        os.makedirs(directory)
    print(directory, dataset)
    if dataset != 'kdd':

        print("Test on", dataset)
        if anomaly_type == 'novelty':
            print("NOVELTY")
            c = 90
            if dataset == 'rop':
                c = 22
        else:
            c = anomaly_proportion * 100

        file_name = "{}_step{}_rd{}".format(label, step, random_seed)

        # The highest c% of scores are flagged as anomalous
        per = np.percentile(scores, 100 - c)
        fname = directory + "{}.csv".format(label)
        csv_file = directory + "scores.csv"
    else:
        file_name = "kdd_step{}_rd{}".format(step, random_seed)
        # Highest 20% are anomalous
        per = np.percentile(scores, 80)
        fname = directory + "results.csv"
        csv_file = directory + "scores.csv"

    scores = np.array(scores)

    csv = pd.DataFrame()
    csv['scores'] = scores
    csv['labels'] = true_labels
    csv.to_csv(csv_file, index=False)

    #try:
    #    scores_norm = (scores-min(scores))/(max(scores)-min(scores))
    #except:
    #    scores_norm = (scores-scores.min())/(scores.max()-scores.min())

    print(max(scores), min(scores))
    roc_auc = do_roc(scores,
                     true_labels,
                     file_name=file_name,
                     directory=directory)
    prc_auc = do_prc(scores,
                     true_labels,
                     file_name=file_name,
                     directory=directory)
    do_cumdist(scores, file_name=file_name, directory=directory)

    prg_auc = 0  #do_prg(scores, true_labels, file_name=file_name, directory=directory)
    '''
    plt.close()

    plt.figure()
    idx_inliers = true_labels == 0
    idx_outliers = true_labels == 1
    hrange = (min(scores), max(scores))
    plt.hist(scores[idx_inliers], 50, facecolor=(0, 1, 0, 0.5),
             label="Normal samples", density=True, range=hrange)
    plt.hist(scores[idx_outliers], 50, facecolor=(1, 0, 0, 0.5),
             label="Anomalous samples", density=True, range=hrange)
    plt.title("Distribution of the anomaly score")
    plt.legend()
    plt.savefig(directory + 'histogram_{}_{}.png'.format(random_seed, dataset),
                transparent=True, bbox_inches='tight')
    '''

    y_pred = (scores >= per)

    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels.astype(int), y_pred.astype(int), average='binary')

    print(
        "Testing at step %i, method %s: Prec = %.4f | Rec = %.4f | F1 = %.4f" %
        (step, method, precision, recall, f1))

    print(
        "Testing method {} | ROC AUC = {:.4f} | PRC AUC = {:.4f} | PRG AUC = {:.4f}"
        .format(method, roc_auc, prc_auc, prg_auc))

    results = [
        model, dataset, anomaly_type, anomaly_proportion, method, weight,
        label, step, roc_auc, prc_auc, prg_auc, precision, recall, f1,
        random_seed,
        time.ctime()
    ]
    save_results_csv("results/results.csv", results, header=0)

    results = [step, roc_auc, prc_auc, precision, recall, f1, random_seed]
    save_results_csv(fname, results, header=0)
Example #18
import json
import pandas as pd

with open('split_pools.json') as f:
    clust = json.loads(f.read())
with open('column.json') as f:
    ids = json.loads(f.read())
with open('latlon.json') as f:
    latlon = json.loads(f.read())

if __name__ == '__main__':
    data = [[0] * 6 for i in range(len(ids))]
    k = 0
    for i in range(len(clust)):
        c = 1
        for j in clust[i]:
            data[k][0] = ids[j]
            data[k][1] = j
            data[k][2] = latlon[j][0]
            data[k][3] = latlon[j][1]
            data[k][4] = i + 1
            data[k][5] = c
            c = c + 1
            k = k + 1
    csv = pd.DataFrame(data,
                       columns=['Employee', 'Sl.No', 'Lat', 'Lon', 'Pool', 'Order'])
    '''with open("clust_analysis.csv", "w") as f:
        writer = csv.writer(f)
        writer.writerows(csv)'''
    csv.to_csv('split_lsis.csv')
Example #19
n_sample = 1

for i in range(sample_n + 1):
    random_samples.append(random.sample(regions_list, choose))

for i in random_samples:
    print(str(n_sample) + " / " + str(sample_n))
    # OR the region bitstrings together and count the covered positions
    all_or = regions_dict[i[0]]
    for j in range(1, choose):
        all_or = all_or | regions_dict[i[j]]
    combos["_".join(str(x) for x in list(i))] = all_or.count("1")
    n_sample = n_sample + 1

sorted_combos = sorted(combos.items(),
                       key=operator.itemgetter(1),
                       reverse=True)
# print(dict(sorted_combos))

csv = pd.DataFrame(index=range(sample_n + 1), columns=["regions", "coverage"])
csv["regions"] = list(dict(sorted_combos).keys())
csv["coverage"] = list(dict(sorted_combos).values())
# print(csv)
plt.figure(figsize=(10, 5))
sns.distplot(csv["coverage"], bins=100, rug=False)
plt.savefig("coverage_figures/t" + str(top_num) + "c" + str(choose) + "n" +
            str(sample_n) + ".png")
csv.to_csv("coverage_data/t" + str(top_num) + "c" + str(choose) + "n" +
           str(sample_n) + ".csv",
           sep=',')
Example #20
def addLink():
    csv = pd.read_csv('../result/info.csv')
    print(len(csv))
    csv['link'] = csv.apply(lambda x: getLinkNumber(x['idx']), axis=1)
    csv.to_csv('../result/info1.csv', index=False)
Example #21
def writeCSV(csv, csvfilepath):
  csv.to_csv(csvfilepath)
Example #22
def delIndex(id):
    # Re-read the file with its first column as the index, then rewrite it
    # without the index, effectively stripping the saved index column
    csv = pd.read_csv('../result/idcsv/%d.csv' % id, index_col=0)
    csv.to_csv('../result/idcsv/%d.csv' % id, index=False)
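A small round-trip illustration of delIndex; the file id and directory are made up and must exist for the sketch to run:

import pandas as pd

df = pd.DataFrame({'url': ['a', 'b']})
df.to_csv('../result/idcsv/7.csv')  # saved with an index column
delIndex(7)                         # rewritten without it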
Example #23
                                  ' ',
                                  '', ' ', ' ', ' ', ' ', ' ',
                                  ' ',
                                  ' ', ' ', '', ' ',
                                  ' '])
            if count == g + 25:
                break

            count += 1
            pass

csv_file.close()
csv_file4.close()


csv = pd.read_csv(start_from_name, delimiter=';')

# csv.columns = ['BEDRIJF_INPUT', 'VESTIGING', 'bagId', 'city', 'country', 'gpslat',
#                'gpslon', 'housenumber', 'houseNumberAddition','postalCode', 'rijksdriehoekX', 'rijksdriehoekY', 'rijksdriehoekZ',
#                'street', 'type', 'branchNumber', 'isMainSbi', 'sbiCode', 'sbiCodeDescription', 'employees',
#                'foundationDate', 'hasCommercialActivities', 'hasEntryInBusinessRegister', 'hasNonMailingIndication',
#                'isBranch', 'isLegalPerson', 'isMainBranch', 'kvkNumber', 'legalForm', 'registrationDate', 'rsin',
#                'businessName', 'currentStatutoryNames', 'currentTradeNames', 'shortBusinessName', 'fuzzy_match_score',
#                'matched_company_name']

start_from_name = '/Users/gielderks/Downloads/Code/Final_excel/profilersinput/ouput_' + start_from + '_till_' + str(count) + '.csv'
csv.to_csv(start_from_name, index=False)


# pprint.pprint(companies)
Example #24
def urlID(id):
    global db
    csv = pd.read_csv('../result/csv/%d.csv' % id)
    csv['urlID'] = csv.apply(lambda x: getID(x.url), axis=1)
    csv.to_csv('../result/idcsv/%d.csv' % id)
Example #25
def sub_task2_file():
    validation = pd.read_csv(PM.task2_validation_csv, header=0, index_col='author_name')
    txt = pd.read_csv(PM.task2_target, sep='\t', header=0, index_col='authorname')
    csv = txt.loc[validation.index].fillna(method='bfill')
    csv.index.name = 'authorname'
    csv.to_csv(PM.sub_task2, sep='\t', header=1, index=True)
Example #26
    def menu(self):

        cel1 = Celular.celular()
        cel2 = Celular.celular()
        compara = Comparador.comparador()

        # Create the xlsx file that will hold the handset data
        workbook = xlsxwriter.Workbook('Dados_Smartphones.xlsx')
        worksheet = workbook.add_worksheet()

        # Write the table's column headers
        worksheet.write(0, 0, 'Marca')
        worksheet.write(0, 1, 'Modelo')
        worksheet.write(0, 2, 'Capacidade da Bateria (mAh)')
        worksheet.write(0, 3, 'Memória RAM (GB)')
        worksheet.write(0, 4, 'Memória de Armazenamento (GB)')
        worksheet.write(0, 5, 'Bluetooth')
        worksheet.write(0, 6, 'NFC')
        worksheet.write(0, 7, 'Dual Chip')
        worksheet.write(0, 8, 'LTE (4G)')
        worksheet.write(0, 9, 'Resolução da Câmera (Mpx)')
        worksheet.write(0, 10, 'Peso (g)')
        worksheet.write(0, 11, 'Dimensões')
        worksheet.write(0, 12, 'Tamanho da Tela (")')
        worksheet.write(0, 13, 'Sistema Operacional')
        worksheet.write(0, 14, 'Versão SO')
        worksheet.write(0, 15, 'Processamento (GHz)')
        worksheet.write(0, 16, 'Link fonte')
        worksheet.write(0, 17, 'Data de atualização')
        worksheet.write(0, 18, 'Ano do lançamento')
        worksheet.write(0, 19, 'Preço (R$)')
        worksheet.write(0, 20, 'Avaliação do Site')
        worksheet.write(0, 21, 'Avaliação dos Usuários')

        # Create the application log file
        log = open('logs.txt', 'w')
        log.write('Smartphones Description - Getting Database\n\n')
        log.write('Execution logs:\n\n')
        log.close()

        book = xlrd.open_workbook("ListaSmartphones.xls")
        sh = book.sheet_by_index(0)
        lista = []

        for rx in range(sh.nrows):
            lista.append(sh.row(rx))

        for linha, value in enumerate(lista):
            #print(linha, value)

            try:
                # Take the handset name from the list to query the sites
                aparelho = str(lista[linha + 1]).split("'")[1].upper()

                kim = Kimovil.kimovil(aparelho)

                pha = PhoneArena.phoneArena(aparelho)

                cel1 = kim.executa()
                cel2 = pha.executa()

                compara.armazena(cel1, cel2, linha, worksheet)
            except Exception:
                print("NOT FOUND")

        log = open('logs.txt', 'a')
        log.write(
            'Most of the chosen data came from the Kimovil site, since it has a more extensive database.\n\n'
        )
        log.close()

        csvfile = "handsets.csv"
        f = open(
            csvfile,
            'wb')  # Abre o arquivo para escrita apagando o conteúdo existente
        csv = pd.read_excel('Dados_Smartphones.xlsx')
        csv.to_csv('handsets.csv', index=False)

        # Create the zip archive
        with ZipFile('gettingDatabase.zip', 'w') as myzip:
            myzip.write('handsets.csv')
            myzip.write('logs.txt')
Example #27
def sortSeg(id):
    print("sorting %d" % id)
    csv = pd.read_csv('../result/segg/seg%d.csv' % id)
    csv.sort_values('times', inplace=True, ascending=False)
    csv.to_csv("../result/seg/%d.csv" % id, index=False)
    except:
        continue

    print(endereco)
    print('PAGINA: ', pagina)

    pagina -= 1

    if pagina <= 0:
        break


csv = pd.DataFrame(lista)

csv.to_csv('lista.csv')
'''
csv = pd.DataFrame()

csv['titulo'] = lista_titulo
csv['corpo'] = lista_corpo
csv['url'] = lista_url

csv.to_csv('el_pais_full.csv')
'''

s1 = pd.Series(lista_titulo, name='titulo')
s2 = pd.Series(lista_corpo, name='corpo')
s3 = pd.Series(lista_url, name='url')

s1.to_csv('s1_internacional.csv')
s2.to_csv('s2_internacional.csv')