Example #1
0
    def test_to_csv_compression(self, compression):

        s = Series([0.123456, 0.234567, 0.567567],
                   index=['A', 'B', 'C'],
                   name='X')

        with ensure_clean() as filename:

            s.to_csv(filename, compression=compression, header=True)

            # test the round trip - to_csv -> read_csv
            rs = pd.read_csv(filename,
                             compression=compression,
                             index_col=0,
                             squeeze=True)
            assert_series_equal(s, rs)

            # explicitly ensure file was compressed
            f = tm.decompress_file(filename, compression=compression)
            text = f.read().decode('utf8')
            assert s.name in text
            f.close()

            f = tm.decompress_file(filename, compression=compression)
            assert_series_equal(s, pd.read_csv(f, index_col=0, squeeze=True))
            f.close()
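Note: newer pandas removed the squeeze=True keyword from read_csv, so the round trip above needs a small change there. A minimal sketch of the same check on current pandas, with an illustrative gzip file name:

import pandas as pd

s = pd.Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], name='X')
s.to_csv('roundtrip.csv.gz', compression='gzip', header=True)

# read back and collapse the single data column to a Series
rs = pd.read_csv('roundtrip.csv.gz', compression='gzip',
                 index_col=0).squeeze('columns')
pd.testing.assert_series_equal(s, rs)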
Example #2
0
def get_id(sec):
    '''Scrape the internal HTML ID for the film'''
    global sec_to_id
    if sec_to_id.empty:
        try:
            sec_to_id = Series.from_csv('hsx_security_to_id', header=0)
        except:
            print('Security -> ID table not found, making a new one')
            with open('hsx_security_to_id', 'w') as f:
                f.write('security,id')
            sec_to_id = Series.from_csv('hsx_security_to_id', header=0)
    if sec not in sec_to_id:
        r = requests.get('http://www.hsx.com/security/view/{}'.format(sec))
        #extract from webpage
        soup = BeautifulSoup(r.text)
        try:
            script = soup.findAll('script')[4].text.split('\n')
            sec_id = script[3].split('=')[2]
            sec_id = sec_id.split('"')[0]
        except:
            print("Cannot find id for {}".format(sec))
            return -1
        sec_to_id[sec] = int(sec_id)
        Series.to_csv(sec_to_id,'hsx_security_to_id',header='security,id')
    return sec_to_id[sec]
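Note: Series.from_csv used above has since been removed from pandas. A minimal sketch of the same save/load pattern for the security -> id table using read_csv; the STARW ticker and its id are purely illustrative:

import pandas as pd

sec_to_id = pd.Series({'STARW': 12345}, name='id')
sec_to_id.index.name = 'security'
sec_to_id.to_csv('hsx_security_to_id', header=True)

# reload the mapping as a Series keyed by security symbol
sec_to_id = pd.read_csv('hsx_security_to_id', index_col=0).squeeze('columns')
print(sec_to_id['STARW'])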
Example #3
0
def fastq_length_plot(fastq, plotname, writename):
    '''fastq length plot'''
    dict_length = defaultdict(int)
    with open(fastq) as handle:
        for record in SeqIO.parse(handle, "fastq"):
            l = len(record.seq)
            dict_length[l] += 1

        for i in range(0, 150):
            if i in dict_length.keys():
                pass
            else:
                dict_length[i] = 0

    df = Series(dict_length)
    df = df.sort_index()
    df.to_csv(writename, header=False, sep="\t", float_format="%.0f")
    df.plot(kind="bar", color="#990000", fontsize=15, width=1)
    plt.xlim(-1, 61)
    plt.xticks(range(0, 61, 10), ("0", "10", "20", "30", "40", "50", "60"),
               rotation=0)
    plt.savefig(plotname, bbox_inches='tight')
    plt.close()
def setup():
    global count_matrix
    global translations
    x = 0
    if isfile(setup_status_file):
        x = int(open(setup_status_file).read())
    if x < 1:
        sys.stdout.write('Creating Database')
        create_db()
        sys.stdout.write('\rCreating Movie Table')
        create_table_movies()
        seed_table_movies()
        with open(setup_status_file, "w") as out:
            out.write('1')
    if x < 2:
        sys.stdout.write('\rLoading Data')
        data = read_csv(movie_data_file + '.csv', delimiter=',')
        sys.stdout.write('\rFormatting Data')
        data['soup'] = data.apply(create_soup, axis=1)
        sys.stdout.write('\rCalculating Counts')
        count_matrix = CountVectorizer().fit_transform(data['soup'])
        save_npz(dataframe_file, count_matrix)
        sys.stdout.write('\rCalculating Indices     ')
        id_to_index = Series(data.index, index=data['id'])
        id_to_index.to_csv(translations_file, encoding='utf-8', header=True)
        sys.stdout.write('\r                        ')
        with open(setup_status_file, "w") as out:
            out.write('2')
    count_matrix = load_npz(dataframe_file)
    translations = read_csv(translations_file)
    sys.stdout.write('\rSetup Complete\n')
Example #5
0
    def generate(cls, intents: list, dir_path: str):
        path = Path(dir_path)

        cls.wordDict.reset()

        intents_df = cls.intents.generateIntentDataFrame(intents)
        classes_s = cls.intents.generateIntentClasses(intents)

        intents_df.to_csv(str(path / 'intents_df.csv'), index=False)

        keys = cls.intents.getKeys()
        for value in intents_df[keys['patterns']].values:
            cls.wordDict.load(value.split(' '))

        wordDict_df = cls.wordDict.getDataFrame()
        wordDict_df.to_csv(path / 'wordDict_df.csv', index=False)

        transformedWordDict_s = Encoder.transformSeries(wordDict_df['words'],
                                                        as_is=False,
                                                        set_series=True)
        transformedWordDict_s.to_csv(path / 'transformedWordDict_s.csv',
                                     index=False)

        train_x_df = Encoder.encode(intents_df['patterns'],
                                    transformedWordDict_s)
        train_x_df.to_csv(path / 'train_x_df.csv', index=False)

        train_y_df = Encoder.oneHotEncode(intents_df['tag'])
        train_y_df.to_csv(path / 'train_y_df.csv', index=False)

        classes_s = Series(train_y_df.columns)
        classes_s.to_csv(path / 'classes_s.csv', index=False)
def disaster_message_tf_idf(max_workers):
    input = File_manager('preprocessed', 'disasterMessage')
    tf_idf = File_manager('analyzed', 'disasterMessageTFIDF')
    idf = File_manager('analyzed', 'disasterMessageIDF')
    cmpr = idf.compare_version(input.ver)
    new_ver = input.ver.copy()

    if new_ver['disasterMessage'] == '0':
        return
    if idf.ver[cmpr[0]] == new_ver['disasterMessage'] and len(cmpr) == 1:
        return

    new_ver['disasterMessageTFIDF'] = new_ver['disasterMessage']
    del new_ver['disasterMessage']
    tf_idf.update_version(new_ver)
    new_ver['disasterMessageIDF'] = new_ver['disasterMessageTFIDF']
    del new_ver['disasterMessageTFIDF']
    idf.update_version(new_ver)

    preprocessed_data = read_csv(input.path)
    docs = preprocessed_data['tokens']
    tfidfv = TfidfVectorizer(
        lowercase=False, token_pattern=r'(?u)[^┃]+?(?=┃|$)'
        ).fit(docs)
    vocabs = sorted(tfidfv.vocabulary_, key=tfidfv.vocabulary_.get)
    tf_idf_data = DataFrame(tfidfv.transform(docs).toarray(), columns=vocabs)
    idf_data = Series(tfidfv.idf_, index=vocabs).sort_values()

    tf_idf_data.to_csv(tf_idf.path, index=False)
    idf_data.to_csv(idf.path, header=False)
def main():
    if os.path.isdir(SRC):
        from pandas import Series
        from natsort import natsorted

        img_list = get_image_paths(SRC, FTYPE)
        fname_luma_list = get_all_lumas(img_list)

        sorted_by_fname = natsorted(fname_luma_list, key=lambda x: x[0])

        if STRIPPATHS is True:
            fl_series = Series(
                {os.path.basename(f): l
                 for f, l in sorted_by_fname})
        else:
            fl_series = Series({f: l for f, l in sorted_by_fname})

        fl_series.to_csv(DST,
                         header=['average_luminance'],
                         index_label='filename')

    elif os.path.isfile(SRC):
        print(average_luma(SRC))

    else:
        print("Specify correct arguments")
        sys.exit()
Example #9
0
def main():
    try:
        path = "/home/longbai/commentdata/UserInfo.txt"
        usercolumns = ['userid', 'useremail', 'anonymous', 'avatar']
        userdfreader = pd.read_csv(path,
                                   header=None,
                                   sep='#V_V#A_A#',
                                   iterator=True,
                                   encoding='utf-8',
                                   names=usercolumns,
                                   engine='python')
        # userdf = pd.read_csv(path, header=None, sep='#V_V#A_A#', encoding='utf-8', names=usercolumns, engine='python')
        goon = True
        c = Series()
        total = 0
        while goon:
            try:
                infochunk = userdfreader.get_chunk(1000000)
                total += 1
                if not infochunk.empty:
                    emailseries = infochunk['useremail'].value_counts()
                    repeatemail = emailseries[emailseries > 1]
                    if not repeatemail.empty:
                        c = c.add(repeatemail, fill_value=0)
                print('repeat email length is {}, total={}'.format(
                    len(c), total))
            except StopIteration:
                print('..........over............')
                goon = False
        print('.....save file start.........')
        c.to_csv('useremail.csv', encoding='utf-8', mode='a')
        print('.....save file over.........')
    except Exception as e:
        print(e)
Example #10
0
def save_sets(X_train: pd.DataFrame,
              X_val: pd.DataFrame,
              y_train: pd.Series,
              y_val: pd.Series,
              X_test: pd.DataFrame,
              location,
              suffix: str = '',
              file_type: str = 'csv') -> None:
    """
    
    """
    # Cater for empty suffix and not having a dangling underscore
    if suffix:
        suffix = f'_{suffix}'

    if file_type == 'csv':
        X_train.to_csv(location / f'X_train{suffix}.{file_type}', index=False)
        X_val.to_csv(location / f'X_val{suffix}.{file_type}', index=False)
        y_train.to_csv(location / f'y_train{suffix}.{file_type}', index=False)
        y_val.to_csv(location / f'y_val{suffix}.{file_type}', index=False)
        X_test.to_csv(location / f'X_test{suffix}.{file_type}', index=False)
    elif file_type == 'parquet':
        X_train.to_parquet(location / f'X_train{suffix}.{file_type}',
                           index=False)
        X_val.to_parquet(location / f'X_val{suffix}.{file_type}', index=False)
        y_train.to_parquet(location / f'y_train{suffix}.{file_type}',
                           index=False)
        y_val.to_parquet(location / f'y_val{suffix}.{file_type}', index=False)
        X_test.to_parquet(location / f'X_test{suffix}.{file_type}',
                          index=False)
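A small usage sketch, assuming save_sets as defined above is in scope; the toy splits and the output directory are illustrative:

from pathlib import Path

import pandas as pd

X = pd.DataFrame({'feature': range(10)})
y = pd.Series(range(10), name='target')

out_dir = Path('data/processed')
out_dir.mkdir(parents=True, exist_ok=True)

save_sets(X_train=X.iloc[:6], X_val=X.iloc[6:8],
          y_train=y.iloc[:6], y_val=y.iloc[6:8],
          X_test=X.iloc[8:], location=out_dir,
          suffix='v1', file_type='csv')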
Example #11
0
def create_vocab(trainqa_path, answerset_path, vocab_path):
    """Create the 4000 vocabulary based on questions in train split.
    3999 most frequent words and 1 <UNK>.

    Args:
        trainqa_path: path to train_qa.json.
        vocab_path: vocabulary file.
    """
    vocab = dict()
    train_qa = pd.read_json(trainqa_path)
    # remove question whose answer is not in answerset
    answerset = pd.read_csv(answerset_path, header=None)[0]
    train_qa = train_qa[train_qa['answer'].isin(answerset)]

    questions = train_qa['question'].values
    for q in questions:
        words = q.rstrip('?').split()
        for word in words:
            if len(word) >= 2:
                vocab[word] = vocab.get(word, 0) + 1
    vocab = Series(vocab)
    vocab.sort_values(ascending=False, inplace=True)
    vocab = DataFrame(vocab.iloc[0:3999])
    vocab.loc['<UNK>'] = [0]
    vocab.to_csv(vocab_path, columns=[], header=False)
Example #12
0
def saveDictionaryToFile(my_dict, file_name):
    '''This function writes the values of a dictionary into a CSV, but it will
    also append the mean value as the last row.'''
    data = Series(my_dict, index=my_dict.keys())
    mean_value = data.mean()
    data['AVG'] = mean_value
    data.sort_index(axis=0, inplace=True)
    data.to_csv(file_name)
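A short usage sketch, assuming saveDictionaryToFile above is in scope; the metric names, values and output file name are illustrative:

scores = {'run_1': 0.91, 'run_2': 0.87, 'run_3': 0.93}
saveDictionaryToFile(scores, 'scores_with_avg.csv')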
Example #13
0
def count_objects(filename):
    with open(f'in/{filename}.geojson', "r", encoding="utf-8") as f:
        dct = json.load(f)

    s = Series(map(lambda x: x['geometry']['type'], dct['features'])).value_counts()
    path_out = f"out/object_count.csv"
    s.to_csv(path_out)
    return path_out
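A self-contained sketch of the same counting pattern on an in-memory GeoJSON-like dict; the feature collection and output path are illustrative:

import pandas as pd

dct = {'features': [
    {'geometry': {'type': 'Point'}},
    {'geometry': {'type': 'Point'}},
    {'geometry': {'type': 'LineString'}},
]}

counts = pd.Series([f['geometry']['type'] for f in dct['features']]).value_counts()
counts.to_csv('object_count.csv')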
Example #14
0
def save_preprocessed(
    df: pd.DataFrame, target: pd.Series, dataset_in_path: str, target_in_path: str
) -> None:
    dataset_out_path = get_output_path(dataset_in_path)
    target_out_path = get_output_path(target_in_path)

    df.to_csv(dataset_out_path, index=False)
    target.to_csv(target_out_path, index=False)
Example #15
0
    def test_to_csv_float_format(self):
        with ensure_clean() as filename:
            ser = Series([0.123456, 0.234567, 0.567567])
            ser.to_csv(filename, float_format='%.2f')

            rs = Series.from_csv(filename)
            xp = Series([0.12, 0.23, 0.57])
            assert_series_equal(rs, xp)
Example #16
0
def detect(data, args):
    in_file = data['r2_path']
    out_prefix = data['sample_id']
    out_file = out_prefix + "_polyA.dat.gz"
    out_name_false = out_prefix + "_none.dat.gz"
    counts = Counter()
    num_line = 0
    logger.my_logger.info("reading file %s" % in_file)
    logger.my_logger.info("creating files %s %s" % (out_file, out_name_false))
    data['detect'] = out_file
    if os.path.exists(out_file):
        return data
    with file_transaction(out_file) as tx_out_file:
        with open_fastq(in_file) as handle, gzip.open(tx_out_file, 'w') as out, gzip.open(out_name_false, 'w') as out_false:
            for line in handle:
                #print line
                num_line += 1
                if num_line % 1000000 == 0:
                    logger.my_logger.info("read %s lines:" % num_line)
                if line.startswith("@HISEQ"):
                    #print line
                    name = line.strip()
                    seq = handle.next().strip()
                    handle.next().strip()
                    qual = handle.next().strip()
                    find = _adapter(seq, qual)
                    #print "%s %s" % (seq, find)
                    if find:
                        seq, qual = find
                        ns = poly_A_percentage(seq)
                        #ns = polyA(seq)
                        if ns:
                            if ns[1]-ns[0] >= 6:
                                #print "positions are" + str(ns[0]) + ".." + str(ns[1])
                                mod = seq[:ns[0]]
                                seq_polyA = seq[ns[0]:ns[1]]
                                seq_gene = seq[ns[1]:]
                                qual_polyA = qual[ns[0]:ns[1]]
                                qual_gene = qual[ns[1]:]
                                #print "%s\t%s\t%s\t%s\t%s\t%s\n" % (name,mod,sf,qf)
                                out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (name, ns[0], ns[1], mod, seq_polyA, qual_polyA, seq_gene, qual_gene))
                                counts['polyA'] += 1
                                if len(mod) > 0:
                                    counts['mod'] += 1
                            else:
                                counts['shortA'] += 1
                                out_false.write("%s\t%s\t%s\t%s\n" % ("shortA", name, seq, qual))
                        else:
                            counts['noA'] += 1
                            out_false.write("%s\t%s\t%s\t%s\n" % ("None", name, seq, qual))
                    else:
                        out_false.write("%s\t%s\t%s\t%s\n" % ("No_tag", name, seq, qual))
                        counts['notag'] += 1
        with file_transaction(out_prefix + ".stat") as tx_stat_file:
            df = Series(counts)
            df.to_csv(tx_stat_file, sep="\t")
        logger.my_logger.info("%s" % counts)
    return data
def predict_lightgbm_model(
    model,
    test_x: pd.DataFrame,
    test_id: pd.Series,
):
    test_pred = model.predict(test_x)
    test_id["SalePrice"] = test_pred
    test_id.to_csv("submition.csv", index=False)
    return test_id
Example #18
0
    def test_to_csv_float_format(self):

        with tm.ensure_clean() as filename:
            ser = Series([0.123456, 0.234567, 0.567567])
            ser.to_csv(filename, float_format="%.2f", header=False)

            rs = self.read_csv(filename)
            xp = Series([0.12, 0.23, 0.57])
            tm.assert_series_equal(rs, xp)
Example #19
0
    def test_to_csv_unicode_index(self):
        buf = StringIO()
        s = Series([u("\u05d0"), "d2"], index=[u("\u05d0"), u("\u05d1")])

        s.to_csv(buf, encoding="UTF-8")
        buf.seek(0)

        s2 = self.read_csv(buf, index_col=0, encoding="UTF-8")
        assert_series_equal(s, s2)
Example #20
0
    def test_to_csv_float_format(self):

        with ensure_clean() as filename:
            ser = Series([0.123456, 0.234567, 0.567567])
            ser.to_csv(filename, float_format="%.2f")

            rs = self.read_csv(filename)
            xp = Series([0.12, 0.23, 0.57])
            assert_series_equal(rs, xp)
Example #21
0
    def test_to_csv_unicode_index(self):
        buf = StringIO()
        s = Series(["\u05d0", "d2"], index=["\u05d0", "\u05d1"])

        s.to_csv(buf, encoding="UTF-8", header=False)
        buf.seek(0)

        s2 = self.read_csv(buf, index_col=0, encoding="UTF-8")
        tm.assert_series_equal(s, s2)
Example #22
0
def load_ipo_info():
    """从网易财经下载个股的IPO数据"""
    cfg = ConfigParser()
    cfg.read('config.ini')
    ipo_info_url = cfg.get('ipo_info', 'ipo_info_url')
    db_path = Path(cfg.get('factor_db', 'db_path'),
                   cfg.get('ipo_info', 'db_path'))
    # Read the codes of all listed stocks
    # data_api = DataApi(addr='tcp://data.tushare.org:8910')
    # data_api.login('13811931480', 'eyJhbGciOiJIUzI1NiJ9.eyJjcmVhdGVfdGltZSI6IjE1MTI4Nzk0NTI2MjkiLCJpc3MiOiJhdXRoMCIsImlkIjoiMTM4MTE5MzE0ODAifQ.I0SXsA1bK--fbGu0B5Is2xdKOjALAeWBJRX6GdVmUL8')
    # df_stock_basics, msg = data_api.query(view='jz.instrumentInfo',
    #                                       fields='status,list_date,name,market',
    #                                       filter='inst_type=1&status=&market=SH,SZ&symbol=',
    #                                       data_format='pandas')
    # if msg != '0,':
    #     print('Failed to read market stock codes.')
    #     return
    # df_stock_basics.symbol = df_stock_basics.symbol.map(lambda x: x.split('.')[0])

    df_stock_basics = Utils.get_stock_basics(all=True)
    # Iterate over stocks and download their IPO info data
    df_ipo_info = DataFrame()
    for _, stock_info in df_stock_basics.iterrows():
        # Skip if the stock's IPO data already exists
        if db_path.joinpath('%s.csv' % stock_info.symbol).exists():
            continue

        print('Downloading IPO data for %s.' % stock_info.symbol)
        ipo_info_header = []
        ipo_info_data = []

        secu_code = Utils.code_to_symbol(stock_info.symbol)
        url = ipo_info_url % stock_info.symbol[2:]
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'html.parser')
        tags = soup.find_all(name='h2')
        for tag in tags:
            if tag.get_text().strip() == 'IPO资料':
                ipo_table = tag.find_next(name='table')
                for tr in ipo_table.find_all(name='tr'):
                    tds = tr.find_all(name='td')
                    name = tds[0].get_text().replace(' ', '').replace(
                        '\n', '').replace('\r', '')
                    value = tds[1].get_text().replace(' ', '').replace(
                        ',', '').replace('\n', '').replace('\r', '')
                    ipo_info_header.append(name)
                    ipo_info_data.append(value)
                ipo_info = Series(ipo_info_data, index=ipo_info_header)
                ipo_info['代码'] = secu_code
                ipo_info.to_csv(db_path.joinpath('%s.csv' % secu_code))
                df_ipo_info = df_ipo_info.append(ipo_info, ignore_index=True)
                break
    if not df_ipo_info.empty:
        df_ipo_info.to_csv(db_path.joinpath('ipo_info.csv'),
                           index=False,
                           mode='a',
                           header=False)
Example #23
0
def _to_csv(data: pandas.Series, filename: str) -> None:
    LOGGER.info('Writing %s.', filename)
    step = 0.00000001
    data = pandas.DataFrame({
        'Second':
        numpy.arange(0.0, step * len(data), step)[:len(data)],
        'Volt':
        data,
    })
    data.to_csv(filename, index=False)
Example #24
0
File: sara.py Project: nk53/SARA
    def _updateSettingsFile(self, new_settings):
        if isfile(self.settings_file):
            old_settings = Series.from_csv(self.settings_file)
            for setting, value in new_settings.iteritems():
                if type(value) == list:
                    # represent lists as csv encapsulated in quotes
                    value = ','.join(map(str, value))
                old_settings[setting] = value
        else:
            old_settings = Series(new_settings)
        old_settings.to_csv(self.settings_file)
Example #25
0
def write_dict(dict_, key_name, value_name, file_path):

    series = Series(dict_, name=value_name)

    series.index.name = key_name

    if not file_path.endswith(".tsv"):

        file_path += ".tsv"

    series.to_csv(file_path, sep="\t")
Example #26
0
def saveMultipleDictionaryToFile(all_dicts, file_name, names):
    '''This function writes the values of a dictionary into a CSV, but it will
    also append the mean value as the last row.'''
    for ii, my_dict in enumerate(all_dicts):
        if ii == 0:
            data = Series(my_dict, index=my_dict.keys())

    mean_value = data.mean()
    data['AVG'] = mean_value
    data.sort_index(axis=0, inplace=True)
    data.to_csv(file_name)
Example #27
0
    def _write_results(self, res_df: pd.Series, sim_func, pred_score,
                       lambda_param):
        sim_func = SIMILARITY_DICT.get(sim_func, sim_func)
        dir_path = dp.ensure_dir(
            f'{self.output_dir}/raw/{sim_func}/{self.predictor}/predictions/')
        file_name = f'predictions-{pred_score}+lambda+{lambda_param}'
        res_df.to_csv(path_or_buf=f'{dir_path}/{file_name}',
                      index=True,
                      sep=' ',
                      float_format='%f',
                      header=False)
Example #28
0
def test_logit_regression(results):

    # lib_path = os.popen("pwd").read()[:-1] + "/lib"
    # sys.path.append(lib_path)

    test_data['Survived'] = 1.223
    print test_data

    compared_results = ka.predict(test_data, results, 'Logit') # Use your model to make prediction on our test set. 
    print compared_results
    compared_results = Series(compared_results)                 # convert our model to a series for easy output
    compared_results.to_csv("logitregres.csv")
Example #29
0
    def test_to_csv_interval_index(self):
        # GH 28210
        s = Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3))

        with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path:
            s.to_csv(path, header=False)
            result = self.read_csv(path, index_col=0, squeeze=True)

            # can't roundtrip intervalindex via read_csv so check string repr (GH 23595)
            expected = s.copy()
            expected.index = expected.index.astype(str)

            tm.assert_series_equal(result, expected)
Example #30
0
def messages_data(soup,message_csv):
    messages = scrape_element(soup, 'messages', '.Message')
    msg_lengths = []
    pd.set_option('display.max_colwidth', -1)
    for k, v in messages.items():
        msg_lengths.append(len(v))
        text = Series(str(np.array(v.encode('utf-8'))))
        print text
        text.to_csv(message_csv, sep=',', header=False, index=False, mode='a')
    df_msg_lgth = DataFrame(msg_lengths)
    df_msg_describe = DataFrame(df_msg_lgth.describe()).T
    cols = df_msg_describe.columns
    df_msg_describe.columns = ['msg_' + c for c in cols]
    return df_msg_describe
Example #31
0
def save_cell_labels(cell_labels: pd.Series,
                     fpath: str,
                     sep: str = '\t') -> None:
    """Save cell labels to plain-text file."""

    if sep == '\t':
        delimited_str = 'tab-delimited'
    elif sep == ',':
        delimited_str = 'comma-delimited'
    else:
        delimited_str = '"%s"-delimited' % sep
    cell_labels.to_csv(fpath, sep=sep)
    _LOGGER.info('Saved labels for %d cells to a %s plain-text file.',
                 cell_labels.size, delimited_str)
Example #32
0
def write_dict(dict_, filepath, key_name, value_name):
    """
    Write dictionary as 2 column table.
    :param dict_: dict;
    :param filepath: str;
    :param key_name: str;
    :param value_name: str;
    :return: None
    """

    s = Series(dict_)
    s.index.name = key_name
    s.name = value_name
    s.to_csv(filepath, sep='\t')
Example #33
0
def sperm_RNA(bowtie_out_combined, prefix):
    '''get tsRNA, rsRNA, piRNA and profile'''
    dic_miR = defaultdict(int)
    dic_miR_iso = defaultdict(int)

    dic_tsRNA = defaultdict(int)
    dic_rsRNA = defaultdict(int)

    dic_piR = defaultdict(int)
    dic_piR_cluster = defaultdict(int)

    with open(bowtie_out_combined) as handle:
        for line in handle:
            seg = line.split()
            count = int(seg[0].split("-")[1])

            ### miRNA
            if re.search("miRNA", seg[2]):
                pass
                #mir = seg[2].split("|")[0]
                #dic_miR[mir]+=count
                ### no 5p isoform
                #if seg[3] == "0":
                #	dic_miR_iso[mir]+=count
            ### tsRNA
            elif re.search("tsRNA", seg[2]):
                tsR = seg[4] + "|" + seg[2]
                dic_tsRNA[tsR] += count
            ### rsRNA
            elif re.search("rsRNA", seg[2]):
                rsR = seg[4] + "|" + seg[2]
                dic_rsRNA[rsR] += count
            ### piRNA
            elif re.search("piRNA", seg[2]):
                dic_piR[seg[4]] += count
                pir_c = seg[2].split("|")[0]
                dic_piR_cluster[pir_c] += count

    ### tsRNA
    #dic_tsRNA = dict(sorted(dic_tsRNA.items(), key=lambda d:d[1], reverse=True))
    tsRNA_out = prefix + ".tsRNA_counts.txt"
    tsRNA_series = Series(dic_tsRNA)
    tsRNA_series.to_csv(tsRNA_out, header=False, sep='\t')

    ### rsRNA
    rsRNA_out = prefix + ".rsRNA_counts.txt"
    rsRNA_series = Series(dic_rsRNA)
    rsRNA_series.to_csv(rsRNA_out, header=False, sep='\t')

    ### piRNA
    piRNA_out = prefix + ".piRNA_seq.txt"
    piRNA_series = Series(dic_piR)
    piRNA_series.to_csv(piRNA_out, header=False, sep='\t')
    piRNA_out2 = prefix + ".piRNA_cluster.txt"
    piRNA_series2 = Series(dic_piR_cluster)
    piRNA_series2.to_csv(piRNA_out2, header=False, sep='\t')
Example #34
0
    def test_to_csv_path_is_none(self):
        # GH 8215
        # Series.to_csv() was returning None, inconsistent with
        # DataFrame.to_csv() which returned string
        s = Series([1, 2, 3])
        csv_str = s.to_csv(path=None)
        assert isinstance(csv_str, str)
Example #35
0
    def test_to_csv_path_is_none(self):
        # GH 8215
        # Series.to_csv() was returning None, inconsistent with
        # DataFrame.to_csv() which returned string
        s = Series([1, 2, 3])
        csv_str = s.to_csv(path_or_buf=None, header=False)
        assert isinstance(csv_str, str)
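Note: the two variants above reflect the rename of the first parameter; on current pandas it is path_or_buf, and passing None makes Series.to_csv return the CSV text. A minimal sketch:

import pandas as pd

s = pd.Series([1, 2, 3])
csv_str = s.to_csv(path_or_buf=None, header=False)

assert isinstance(csv_str, str)
print(csv_str)  # index,value rows: 0,1 / 1,2 / 2,3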
Example #36
0
    def test_to_csv_from_csv_categorical(self):

        # CSV with categoricals should result in the same output as when one
        # would add a "normal" Series/DataFrame.
        s = Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
        s2 = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        res = StringIO()
        s.to_csv(res)
        exp = StringIO()
        s2.to_csv(exp)
        self.assertEqual(res.getvalue(), exp.getvalue())

        df = DataFrame({"s": s})
        df2 = DataFrame({"s": s2})
        res = StringIO()
        df.to_csv(res)
        exp = StringIO()
        df2.to_csv(exp)
        self.assertEqual(res.getvalue(), exp.getvalue())
Example #37
0
    def test_to_csv_compression(self, compression):

        s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
                   name='X')

        with ensure_clean() as filename:

            s.to_csv(filename, compression=compression, header=True)

            # test the round trip - to_csv -> read_csv
            rs = pd.read_csv(filename, compression=compression, index_col=0,
                             squeeze=True)
            assert_series_equal(s, rs)

            # explicitly ensure file was compressed
            with tm.decompress_file(filename, compression=compression) as fh:
                text = fh.read().decode('utf8')
                assert s.name in text

            with tm.decompress_file(filename, compression=compression) as fh:
                assert_series_equal(s, pd.read_csv(fh,
                                                   index_col=0, squeeze=True))
Example #38
0
    def test_to_csv_from_csv_categorical(self):

        # CSV with categoricals should result in the same output
        # as when one would add a "normal" Series/DataFrame.
        s = Series(pd.Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
        s2 = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
        res = StringIO()

        s.to_csv(res, header=False)
        exp = StringIO()

        s2.to_csv(exp, header=False)
        assert res.getvalue() == exp.getvalue()

        df = DataFrame({"s": s})
        df2 = DataFrame({"s": s2})

        res = StringIO()
        df.to_csv(res)

        exp = StringIO()
        df2.to_csv(exp)

        assert res.getvalue() == exp.getvalue()
Example #39
0
		else:
			tf.ix[fila,word] = tf.ix[fila,word] + 1
	tf.ix[fila] = tf.ix[fila] / len(tokens)
	fila = fila + 1
	print "Fila: ", fila
#print tf
print "TF MATRIX LISTO"
idf = Series()
#print idf.index
for term in termslist.keys():
	apariciones = termslist[term]
	totaldoc = data.shape[0]
	argumento = totaldoc / (1 + apariciones)
	#print argumento
	test = Series({term : math.log(argumento)})
	idf = idf.add(test, fill_value=0)
#print idf
print "IDF LISTO"
gc.collect()
for i, row in tf.iterrows():
	print i
	tf.ix[i] = row.multiply(idf)
	#gc.collect()

#print tf
#print idf 
#tfidf = tf.apply(lambda x: x.multiply(idf), axis = 1)
#print tfidf
tf.to_csv('tfidf.csv')
idf.to_csv('idf.csv')
                 }

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=4, verbose=1, scoring='roc_auc', cv=3)
    grid_search.fit(scaled_X_train, y_train)
    print 'Best score: %.3f'%grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' %(param_name, best_parameters[param_name])

    predictions = grid_search.predict(scaled_X_test)
    print classification_report(y_test, predictions)

    for param_name in parameters.keys():
        clf_args[param_name[5:]] = best_parameters[param_name]

    print 'clf_args:', clf_args

    final_scaler = preprocessing.StandardScaler()
    scaled_final_train_df = final_scaler.fit_transform(final_train_df)
    scaled_final_test_df = final_scaler.transform(final_test_df)

    classifier = RandomForestClassifier(**clf_args)
    classifier.fit(scaled_final_train_df, final_targets_df)
    output = classifier.predict_proba(scaled_final_test_df)
   
    output_probabilities = [round(x[1], 3) for x in output]

    S = Series(output_probabilities, index=Ids)
    S.to_csv('Santander_randomForest_results.csv', header=True, index_label=['ID', 'TARGET'])
Example #41
0
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import json
import requests

ranking_url = 'https://itunes.apple.com/jp/rss/topfreeapplications/limit=100/json'
raw_ranking = requests.get(ranking_url).json()
ranking = [{'id': entry['id']['attributes']['im:id'], 'name': entry['im:name']}for entry in raw_ranking['feed']['entry']]

data = []
for element in ranking:
    review_url = 'https://itunes.apple.com/jp/rss/customerreviews/id={0}/json'.format(element['id'])
    raw_reviews = requests.get(review_url).json()
    try:
        reviews = [review['content']['label'] for review in raw_reviews['feed']['entry'][1:50]]
        for review in reviews:
            data.append({'id': element['id'], 'name': element['name'], 'review': review.encode('utf-8')})
    except KeyError:
        continue
data = Series(data)

data.to_csv('reviews.csv')
Example #42
0
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.neighbors import KNeighborsClassifier

train_file = pd.read_csv('train.csv')
test_file = pd.read_csv('test.csv')

train_df = DataFrame(train_file)
test_df = DataFrame(test_file)

# make separate data frame for digits and take out of training set
target_df = train_df.label
train_df = train_df.drop(['label'], axis=1)

train_data = train_df.values.astype(np.uint8)
target_data = target_df.values.astype(np.uint8)
test_data = test_df.values.astype(np.uint8)

n_neighbors, weights = 20, 'distance'

clf = KNeighborsClassifier(n_neighbors, weights=weights)
clf.fit(train_data, target_data)

print('Starting k-neighbors...')

output = clf.predict(test_data)

ImageIds = np.arange(1, 28001)
S = Series(output, index=ImageIds, dtype=np.uint8)
S.to_csv('kNeighbors_results.csv', header=True, index_label=['ImageId', 'Label'])
Example #43
0
print(data.to_csv(sys.stdout, na_rep='NULL'))

print('\n')

print(data.to_csv(sys.stdout, index=False, header=False))

print('\n')

print(data.to_csv(sys.stdout, index=False, columns=['a','b','c']))

print('\n')

dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)

ts.to_csv('data/tseries.csv')

print('\n')

print(Series.from_csv('data/tseries.csv', parse_dates=True))

print('\n')
Example #44
0
def _1(data: pd.Series) -> AlphaDiversityFormat:
    ff = AlphaDiversityFormat()
    with ff.open() as fh:
        data.to_csv(fh, sep='\t', header=True)
    return ff
Example #45
0
# show that predicted survival probabilities are
# inversely correlated with Age
fig.add_subplot(224, axisbg="#DBDBDB")
plt.scatter(res.predict(),x.Age , alpha=a)
plt.grid(True, linewidth=0.15)
plt.title("The Change of Survival Probability by Age")
plt.xlabel("Predicted chance of survival")
plt.ylabel("Age")


################################
##  Part 4:  RUN ON TEST SET  ##
################################

test_data = pd.read_csv("data/test.csv")

# "Add our independent variable to our test data. (It's usually left
# blank by Kaggle because it's the value you're trying to predict.)"
# 1.23 is just a random value, it could have been anything
test_data['Survived'] = 1.23

# use model to make prediction on test set
compared_results = ka.predict(test_data, results, 'Logit')

# convert model to Series for easy output
compared_results = Series(compared_results)

# output and submit to Kaggle
compared_results.to_csv("data/output/logitregres.csv")
########################################################################
parsed=parse(urlopen('http://nymag.com/daily/intelligencer/2013/04/bloombergs-vip-terminal-tweeters.html'))
doc=parsed.getroot()
links=doc.findall('.//a')
links[15:20]
lnk=links[28]
lnk
lnk.get('href')
lnk.text_content()
urls=[lnk.get('href') for lnk in doc.findall('.//a')]
temp=Series(urls[103:205])

for i in range(0,len(temp)):
    temp[i]=temp[i].replace('//www.twitter.com/','')

temp.to_csv("nymag_tweets.csv")



########################################################################
########################################################################
parsed=parse(urlopen('http://www.businessinsider.com/the-best-finance-people-on-twitter-2012-4?op=1'))
doc=parsed.getroot()
links=doc.findall('.//a')
links[15:20]
lnk=links[28]
lnk
lnk.get('href')
lnk.text_content()
urls=[lnk.get('href') for lnk in doc.findall('.//a')]
str_url='https://twitter.com/#!/'
Example #47
0
# gender and class (again it will be gender + 1 as was in GenderClass
# we also take family size + 1 in order to distinguish men and women that are alone)

train_df['GenderFamilySize'] = (train_df.Gender + 1) * (train_df.FamilySize + 1)
test_df['GenderFamilySize'] = (test_df.Gender + 1) * (test_df.FamilySize + 1)

# passenger 1044 is a 3rd class male 60.5yrs old who embarked at Southampton
# and he is missing his fare value, from data the mean fare for 3rd class 
# man embarking at Southampton is 13.307149

test_df.loc[ (test_df.Fare.isnull()), 'Fare'] = 13.307149

train_df = train_df.drop(['SibSp', 'Parch', 'Embarked', 'GenderFamilySize'], axis=1)
test_df = test_df.drop(['SibSp', 'Parch', 'Embarked', 'GenderFamilySize'], axis=1)

# We are left with columns for 'Pclass', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Gender', 'AgeFill', 'GenderClass', 'FamilySize', 'GenderFamilySize'

train_data = train_df.values
test_data = test_df.values

# begin the random forest

forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(train_data[0::,1::], train_data[0::, 0])
output = forest.predict(test_data)

PassengerIds = np.arange(892, 1310)
S = Series(output, index=PassengerIds, dtype=int)
S.to_csv('titanic_results.csv', header=True, index_label=['PassengerId','Survived'])

                  linewidth=3
                  )
plt.ylabel('Density of Prox1 in Hilus',size=20)
plt.xticks(size = 20, rotation=0)
plt.yticks(size = 14, rotation=0)
# Density Bar Graph  SW and C57

DensityTableSW_C57 = Series([Density[Dictionary['C57_p30']].mean(),Density[Dictionary['SW_p30']].mean(),Density[Dictionary['C57']].mean(),Density[Dictionary['SW']].mean()], index =['C57 P30','SW P30','C57 P60','SW'])
SW_p30_Error = Density[Dictionary['SW_p30']].std()/sqrt(Density[Dictionary['SW_p30']].count())
SW_Error = Density[Dictionary['SW']].std()/sqrt(Density[Dictionary['SW']].count())

plt.figure()   
DensityTableSW_C57.plot(kind='bar',yerr=[C57_p30_Error,SW_p30_Error,C57_Error,SW_Error],color='y')
plt.ylabel('Density of Prox1 in Hilus')
plt.xticks(rotation=0)
DensityTableSW_C57.to_csv(r'C:\Users\keriambermudez\Dropbox\Figures\Figure4 Strains\New folder\C57_SW_Density.csv')

## PLotting Hilus area Bar Graph

Table_Area_SW_C57 = Series([HilusAreaSum[Dictionary['C57_p16']].mean(),HilusAreaSum[Dictionary['C57_p30']].mean(),HilusAreaSum[Dictionary['SW_p30']].mean(),HilusAreaSum[Dictionary['C57']].mean(),HilusAreaSum[Dictionary['SW']].mean()], index =['C57 P16','C57 P30','SW P30','C57 P60','SW'])
Table_Area_SW_C57_STD = Series([HilusAreaSum[Dictionary['C57_p16']].std(),HilusAreaSum[Dictionary['C57_p30']].std(),HilusAreaSum[Dictionary['SW_p30']].std(),HilusAreaSum[Dictionary['C57']].std(),HilusAreaSum[Dictionary['SW']].std()], index =['C57 P16','C57 P30','SW P30','C57 P60','SW'])
Table_Area_SW_C57_Error = Table_Area_SW_C57_STD/SQRT
plt.figure()   
Table_Area_SW_C57.plot(kind='bar',yerr=Table_Area_SW_C57_Error,color='y')
plt.ylabel('Hilus Area')
plt.xticks(rotation=0)

## PLotting Total Cells Bar Graph

Table_Cells_SW_C57 = Series([TotalCellsSum[Dictionary['C57_p16']].mean(),TotalCellsSum[Dictionary['C57_p30']].mean(),TotalCellsSum[Dictionary['SW_p30']].mean(),TotalCellsSum[Dictionary['C57']].mean(),TotalCellsSum[Dictionary['SW']].mean()], index =['C57 P16','C57 P30','SW P30','C57 P60','SW'])
Table_Cells_SW_C57_STD = Series([TotalCellsSum[Dictionary['C57_p16']].std(),TotalCellsSum[Dictionary['C57_p30']].std(),TotalCellsSum[Dictionary['SW_p30']].std(),TotalCellsSum[Dictionary['C57']].std(),TotalCellsSum[Dictionary['SW']].std()], index =['C57 P16','C57 P30','SW P30','C57 P60','SW'])
Example #49
0
# Specify the number of rows to read
df10 = pd.read_csv('resources/ex5.csv',nrows=10)
# print df10

# Write to CSV: missing values written as 'Nan', row labels omitted, column labels kept as the header
df7.to_csv('resources/write1.csv',sep=',',na_rep='Nan',index=False)

# Write the specified columns in the given order
df7.to_csv('resources/write1.csv',sep=',',na_rep='Nan',index=False,columns=['b','c','a'])


# Reading and writing a Series
dates1 = pd.date_range('1/1/2000','1/1/2016')
s = Series(dates1,index=np.arange(dates1.size))
# print s
s.to_csv('resources/write2.csv',sep=',')
s1 = Series.from_csv('resources/write2.csv')
# print s1

# csv
f = open('resources/write1.csv')
reader = csv.reader(f)
lines = list(reader)
header,values = lines[0],lines[1:]
data_dic = {
    k:v for k,v in zip(header,zip(*values))
}
# print data_dic

    # Embarked
    s3fa_col = (dfn.Pclass == 3).mul(dfn.Sex == 'female').mul(dfn.Embarked == 'S').mul(dfn.Title > 0)
    s3fa_fn = lambda x: 0.5 if x else -0.5
    s3fa_col = s3fa_col.map(s3fa_fn)
    s3fa_col.name = 'S3FA'
    dfne = pd.concat([dfn, s3fa_col], axis=1)

    # Result
    cols = ['C0', 'C1', 'C2', 'Gender', 'Title', 'S3FA']
    return dfne[cols]


df = pd.read_csv('data/train.csv')
mdf = munge(df)
X = mdf
y = df['Survived']
tuned_parameters = {'penalty': ['l1', 'l2'],
                    'C': np.logspace(-2, 0, 5),
                    'max_iter': np.logspace(2, 3, 5)}
clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=5, n_jobs=4)
clf.fit(X, y)
print(clf.best_estimator_)
for params, mean_score, scores in clf.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r"
          % (mean_score, scores.std() / 2, params))
test_df = pd.read_csv('data/test.csv')
res = clf.predict(munge(test_df))
res = Series(res, name='Survived', index=test_df.index)
res = pd.concat([test_df, res], axis=1)[['PassengerId', 'Survived']]
res.to_csv('data/out-1-lr.csv', index=False)
Example #51
0
    return (filepath, trace)

def sim_busy_times(trace, cpus, interval):
    data = {num_cores: trace.cpu.simultaneously_busy_time(num_cores, cpus=list(cpus), interval=INTERVAL) for num_cores in xrange(len(cpus)+1)}
    total_duration = trace.duration if not INTERVAL else INTERVAL.duration
    return Series(data=data.values(), index=data.keys(), name=trace.filename) / total_duration

_files = glob.glob(r'{path}\*{file_ext}'.format(path=PATH, file_ext=FILE_EXT))
F_DICT = {_fp: os.path.split(_fp)[1].split('.')[0] for _fp in _files}

little_idle_dict = defaultdict(list)
big_idle_dict = defaultdict(list)
    
for _file in _files:
    fp, trace = parse_file(_file)

    for cpu in ALL_CPUS:
        for item in trace.cpu.lpm_intervals(cpu=cpu, interval=INTERVAL):
            if item.cpu in BIG_CPUS:
                big_idle_dict[item.state].append(item.interval.duration)
            elif item.cpu in LITTLE_CPUS:
                little_idle_dict[item.state].append(item.interval.duration)
    

for k, v in little_idle_dict.iteritems():
    results = Series(v)*1e6
    results.to_csv(r'{path}\LITTLE_C{idx}.csv'.format(path=PATH, idx=k))

for k, v in big_idle_dict.iteritems():
    results = Series(v)*1e6
    results.to_csv(r'{path}\BIG_C{idx}.csv'.format(path=PATH, idx=k))
if __name__ == "__main__":
    train_dir = '/Users/ray/Downloads/trainResized'
    test_dir = '/Users/ray/Downloads/testResized'
    train_labels_filepath = '/Users/ray/Downloads/trainLabels.csv'

    # download the features per image (train)
    train_features = generate_features(train_dir)

    # download the labels
    train_labels = generate_labels(train_labels_filepath)

    # merge the features with labels
    train_data = pd.merge(left=train_labels, right=train_features, left_on='ID', right_index=True)

    # train the model
    forest_classifier = RandomForestClassifier(n_estimators=100)
    training_input = train_data.ix[:, 2:].values
    target_values = train_data['Class'].apply(lambda x: ord(x)).values
    forest_model = forest_classifier.fit(training_input, target_values)

    # download the features per image (test)
    test_features = generate_features(test_dir)

    # predict the test
    test_labels_raw = forest_model.predict(test_features)
    test_labels = Series(test_labels_raw, index=test_features.index).apply(lambda x: chr(x))

    # build output for the test
    test_labels.name = 'Class'
    test_labels.to_csv('/Users/ray/Downloads/result.csv', index_label='Id', header=True)
#                    'xgb__learning_rate': (0.01, 0.03, 0.05),
#                    'xgb__colsample_bytree': (0.8, 0.85)
#                 }
#
#    grid_search = GridSearchCV(pipeline, parameters, n_jobs=4, verbose=1, scoring='roc_auc', cv=3)
#    grid_search.fit(scaled_X_train, y_train)
#    print 'Best score: %.3f'%grid_search.best_score_
#    print 'Best parameters set:'
#    best_parameters = grid_search.best_estimator_.get_params()
#    for param_name in sorted(parameters.keys()):
#        print '\t%s: %r' %(param_name, best_parameters[param_name])
#
#    predictions = grid_search.predict(scaled_X_test)
#    print classification_report(y_test, predictions)
#
#    for param_name in parameters.keys():
#        xgb_args[param_name[5:]] = best_parameters[param_name]
#
#    print 'xgb_args:', xgb_args

    final_scaler = preprocessing.StandardScaler()
    scaled_final_train_df = final_scaler.fit_transform(final_train_df)
    scaled_final_test_df = final_scaler.transform(final_test_df)

    classifier = XGBClassifier(**xgb_args)
    classifier.fit(scaled_final_train_df, final_targets_df)
    output = classifier.predict_proba(scaled_final_test_df)[:,1]

    S = Series(output, index=Ids)
    S.to_csv('Santander_xgboost_results_1.csv', header=True, index_label=['ID', 'TARGET'])
    title_col.name = "Title"
    dfn = pd.concat([df, title_col], axis=1)

    # Embarked
    s3fa_col = (dfn.Pclass == 3).mul(dfn.Sex == "female").mul(dfn.Embarked == "S").mul(dfn.Title > 0)
    s3fa_fn = lambda x: 0.5 if x else -0.5
    s3fa_col = s3fa_col.map(s3fa_fn)
    s3fa_col.name = "S3FA"
    dfne = pd.concat([dfn, s3fa_col], axis=1)

    # Result
    cols = ["C0", "C1", "C2", "Gender", "Title", "S3FA"]
    return dfne[cols]


df = pd.read_csv("../input/train.csv")
mdf = munge(df)
X = mdf
y = df["Survived"]
tuned_parameters = {"penalty": ["l1", "l2"], "C": np.logspace(-2, 0, 5), "max_iter": np.logspace(2, 3, 5)}
clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=5, n_jobs=4)
clf.fit(X, y)
print(clf.best_estimator_)
for params, mean_score, scores in clf.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))
test_df = pd.read_csv("../input/test.csv")
res = clf.predict(munge(test_df))
res = Series(res, name="Survived", index=test_df.index)
res = pd.concat([test_df, res], axis=1)[["PassengerId", "Survived"]]
res.to_csv("out-1-lr.csv", index=False)
Example #55
0
if not os.path.isfile("coverages.csv") :
    print "compute coverages"
    if 'coverage' in locals() : del coverage
    handle = open(file, "rU")
    for record in tqdm(SeqIO.parse(handle, "fasta")) :
        seq = str(record.seq)
        l = len(seq)
        if 'coverage' not in locals():
            coverage = [0]*l

        for (i,c) in enumerate(seq):
            if c not in ['.','-']:
                coverage[i] = coverage[i] +1
    coverage=Series(coverage)
    coverage.to_csv("coverages.csv",index=False)
    handle.close()
else :
    print "import coverages"
    coverage = Series.from_csv("coverages.csv",header=-1, index_col=False)

print "compute median-ish things"
medians = []
means = []
maxs = []
mins = []
lens = []
left = []
right = []
unsure = []
handle = open(file, "rU")
def main():
    out_dir = os.path.dirname(__file__)

    ex1_path = study.DATA_DIR + '/ch06/ex1.csv'
    cat(ex1_path)

    df = pd.read_csv(ex1_path)
    p(df)
    p(pd.read_table(ex1_path, sep=','))

    p('header less---------------------')
    ex2_path = study.DATA_DIR + '/ch06/ex2.csv'
    cat(ex2_path)
    names = ['a','b', 'c', 'd', 'message']
    p(pd.read_csv(ex2_path, header=None))
    p(pd.read_csv(ex2_path, names=names))
    p(pd.read_csv(ex2_path, names=names, index_col='message'))

    p('hierarchy index---------------------')
    mindex_path = study.DATA_DIR + '/ch06/csv_mindex.csv'
    cat(mindex_path)
    p(pd.read_csv(mindex_path, index_col=['key1', 'key2']))

    p('separate by regex-------------')
    ex3_path = study.DATA_DIR + '/ch06/ex3.csv'
    cat(ex3_path)
    p(pd.read_csv(ex3_path, sep='\s+'))

    p('skip rows-----------')
    ex4_path = study.DATA_DIR + '/ch06/ex4.csv'
    cat(ex4_path)
    p(pd.read_csv(ex4_path, skiprows=[0,2,3]))

    p('N/A------------------')
    ex5_path = study.DATA_DIR + '/ch06/ex5.csv'
    cat(ex5_path)
    result = pd.read_csv(ex5_path)
    p(result)
    p(pd.isnull(result))
    result = pd.read_csv(ex5_path, na_values=['NULL', '12']) # 12 is NA
    p(result)

    p('N/A dict------------------')
    sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
    p(sentinels)
    p(pd.read_csv(ex5_path, na_values=sentinels))

    p('6.1.1 read data chunk size---------------------')
    ex6_path = study.DATA_DIR + '/ch06/ex6.csv'
    p(pd.read_csv(ex6_path).count())
    p(pd.read_csv(ex6_path, nrows=5))
    chunker = pd.read_csv(ex6_path, chunksize=1000)
    p(chunker)
    tot = Series([])
    for piece in chunker:
        tot = tot.add(piece['key'].value_counts(), fill_value=0)
    tot = tot.order(ascending=False)
    p(tot[:10])

    p('6.1.2 write---------------------')
    data = pd.read_csv(ex5_path)
    p(data)

    ex5_out_path = out_dir + '/ex5_out.csv'
    data.to_csv(ex5_out_path)
    cat(ex5_path)

    data.to_csv(sys.stdout, index=False, header=False)
    print ''
    data.to_csv(sys.stdout, index=False, cols=list('abc'))
    print ''

    p('Series--------------')
    tseries_out_path = out_dir + '/tseries_out.csv'
    dates = pd.date_range('1/1/2000', periods=7)
    ts = Series(np.arange(7), index=dates)
    ts.to_csv(tseries_out_path)
    cat(tseries_out_path)
    p(Series.from_csv(tseries_out_path, parse_dates=True))

    p('6.1.3 csv-------------------------')
    ex7_path = study.DATA_DIR + '/ch06/ex7.csv'
    cat(ex7_path)
    f = open(ex7_path)
    reader = csv.reader(f)
    for line in reader:
        print line
    lines = list(csv.reader(open(ex7_path)))
    header, values = lines[0], lines[1:]
    data_dict = {h: v for h,v in zip(header, zip(*values))}
    p(data_dict)

    my_data_out_path = out_dir + '/mydata.csv'
    with open(my_data_out_path, 'w') as fp:
        writer = csv.writer(fp, dialect=my_dialect)
        writer.writerow(('one', 'two', 'three'))
        writer.writerow(('1', '2', '3'))
        writer.writerow(('4', '5', '6'))
        writer.writerow(('7', '8', '9'))
    cat(my_data_out_path)

    p('6.1.4 JSON-------------------------')
    obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
             {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
    result = json.loads(obj)
    p(result)
    asjson = json.dumps(result)
    p(asjson)
    siblings = DataFrame(result['siblings'], columns=['name', 'age'])
    p(siblings)

    p('6.1.4 XML/HTML Web Scraping-------------------------')
    url = '' #'http://finance.yahoo.com/q/op?s=AAPL+Options'
    if url:
        parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
        doc = parsed.getroot()
        p([lnk.get('href') for lnk in doc.findall('.//a')][-10:])

        tables = doc.findall('.//table')
        p(parse_options_data(tables[9])[:5])
        p(parse_options_data(tables[13])[:5])

    p('6.1.5 Read XML-------------------------')
    xml_path = out_dir + '/Performance_MNR.xml'
    xml_content ="""
<INDICATOR>
    <INDICATOR_SEQ>373889</INDICATOR_SEQ>
    <PARENT_SEQ></PARENT_SEQ>
    <AGENCY_NAME>MEtro-North Railroad</AGENCY_NAME>
    <INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
    <DESCRIPTION>Percent of the time that escalators are operational systemwide. The availability rate is based on physical observations performed the morning of regular business days only. This is a new indicator the agency began reporting in 2009.</DESCRIPTION>
    <PERIOD_YEAR>2011</PERIOD_YEAR>
    <PERIOD_MONTH>12</PERIOD_MONTH>
    <CATEGORY>Service Indicators</CATEGORY>
    <FREQUENCY>M</FREQUENCY>
    <DESIRED_CHANGE>U</DESIRED_CHANGE>
    <INDICATOR_UNIT>%</INDICATOR_UNIT>
    <DECIMAL_PLACES>1</DECIMAL_PLACES>
    <YTD_TARGET>97.00</YTD_TARGET>
    <YTD_ACTUAL></YTD_ACTUAL>
    <MONTHLY_TARGET>97.00</MONTHLY_TARGET>
    <MONTHLY_ACTUAL></MONTHLY_ACTUAL>
</INDICATOR>
"""
    if not os.path.exists(xml_path):
        with open(xml_path, 'w') as f:
            f.write(xml_content)
    parsed = objectify.parse(open(xml_path))
    root = parsed.getroot()
    data = []
    skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',
                   'DESIRED_SEQ', 'DECIMAL_PLACES']
    p(dir(root))
    for elt in root: # .INDICATOR:
        el_data = {}
        for child in elt.getchildren():
            if child.tag in skip_fields:
                continue
            el_data[child.tag] = child.pyval
        data.append(el_data)
    perf = DataFrame(data)
    p(perf)

    tag = '<a href="http://google.com">Google</a>'
    root = objectify.parse(StringIO.StringIO(tag)).getroot()
    p(root)
    p(root.get('href'))
    p(root.text)
          "silent": 1,
          "thread": 1,
          "seed": 1301
          }
num_boost_round = 1000

print("Train a XGBoost model")
X_train, X_valid = train_test_split(train, test_size=0.01, random_state=10)
y_train = np.log1p(X_train.Sales)
y_valid = np.log1p(X_valid.Sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=True)

print("Validating")
predict = gbm.predict(xgb.DMatrix(X_valid[features]))
error = rmspe(X_valid.Sales.values, np.expm1(predict))
print('RMSPE: {:.6f}'.format(error))

print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
ytest = gbm.predict(dtest)
sub = Series()
sub = sub.append(Series(np.expm1(ytest), index = test.Id))
sub = sub.append(Series(0, index = closedId))
# Make Submission
sub = pd.DataFrame({"Id": sub.index, "Sales": sub.values})
sub.to_csv("xgboost_submission2.csv", index=False)