def PrintResults(urldata):
    intradata = get_and_proess_data(urldata)
    # get an instance of the algorithm class
    run_algo = EarningsAlgorithm.Algorithm()
    train_file = df.from_csv("data/intradata_frompython_train.csv", index_col=False, header=0)
    train_file_output = 'data/intradata_frompython_train_results.csv'
    run_algo.train_algo(train_file, train_file_output)
    test_file = df.from_csv("data/intradata_frompython_test.csv", index_col=False, header=0)
    test_file_output = 'data/intradata_frompython_test_results.csv'
    agg_signals, past_5_signals, past_5_values, past_5_bayes = run_algo.test_algo(test_file, test_file_output)
    outputs = []
    outputs.append(str(agg_signals))
    outputs.append(str(past_5_signals))
    outputs.append(str(past_5_values))
    outputs.append(str(past_5_bayes))
def run_my_task(self, inputs, settings, outputs):
    # Get files.
    fermata_indices_file = inputs['Cadence Indexer - fermata indices (Pandas DataFrame csv)'][0]['resource_path']
    infile = inputs['Cadence Indexer - figured bass (Pandas DataFrame csv)'][0]['resource_path']
    outfile = outputs['Cadence Indexer - Pandas DataFrame csv'][0]['resource_path']

    # De-serialize the DataFrames.
    fermata_indices = DataFrame.from_csv(fermata_indices_file, header=[0, 1])  # We know the first two rows constitute a MultiIndex
    figured_bass = DataFrame.from_csv(infile, header=[0, 1])  # We know the first two rows constitute a MultiIndex

    # Add fermatas to the DataFrame.
    cadence_marker = fermata_indices.apply(lambda x: 'Fermata' in x.values, axis=1)
    pieces = {'Basso seguente': figured_bass['Basso seguente']['3'],
              'Figured bass': figured_bass['Figured bass'].T.loc[['[0,3 1,3 2,3] (3)']].T,
              'Cadence': cadence_marker}
    figured_bass = concat(pieces, axis=1)

    # Find cadences.
    marker_column = 'Cadence'
    cadence_size = 4
    indices = figured_bass[figured_bass[marker_column][0] == True].index
    cadences = []
    for index in indices:
        cadenceEndLocation = figured_bass.index.get_loc(index)
        harmonies = []
        for cadenceStep in range(cadenceEndLocation - cadence_size + 1, cadenceEndLocation + 1):
            harmonies.append(figured_bass.iloc[cadenceStep])
        cadence = DataFrame(harmonies)
        cadences.append(cadence)

    # Output.
    self.write_cadences_to_file(cadences, outfile)
    return True
def main(): print("==== START ====") dataset = DataFrame.from_csv('../data/stats/shots_teams_2013_2014.tsv', sep='\t', index_col=False) current_day = DataFrame.from_csv('../data/stats/shots_players_2015.tsv', sep='\t', index_col=False) target = dataset.loc[:,'goal'] train = dataset.loc[:,['degree', 'distance', 'shot_headed', 'corner']] # For using the model train_target = target train_features = train dataset_test = current_day test_target = dataset_test.loc[:,'goal'] test_features = dataset_test.loc[:,['degree', 'distance', 'shot_headed', 'corner']] dataset_test = dataset_test.reset_index(drop=True) test_target = test_target.reset_index(drop=True) test_features = test_features.reset_index(drop=True) model = LogisticRegression() model = model.fit(train_features, train_target) predicted_probs = model.predict_proba(test_features) predicted_goals = DataFrame(predicted_probs[:,1], columns=['predict']) results = concat([dataset_test, predicted_goals], axis=1) grouped_results = results.groupby(['start', 'name']).sum() grouped_results["count"] = results.groupby(['start', 'name']).size() grouped_results["ratio"] = grouped_results["predict"]/grouped_results["count"] DataFrame(grouped_results).to_csv('../data/stats/exp_goals_players_2015.tsv', sep='\t', encoding='utf-8') print("==== END ====")
def test_to_csv_from_csv1(self):
    with ensure_clean('__tmp_to_csv_from_csv1__') as path:
        self.frame['A'][:5] = nan

        self.frame.to_csv(path)
        self.frame.to_csv(path, columns=['A', 'B'])
        self.frame.to_csv(path, header=False)
        self.frame.to_csv(path, index=False)

        # test roundtrip
        self.tsframe.to_csv(path)
        recons = DataFrame.from_csv(path)
        assert_frame_equal(self.tsframe, recons)

        self.tsframe.to_csv(path, index_label='index')
        recons = DataFrame.from_csv(path, index_col=None)
        assert(len(recons.columns) == len(self.tsframe.columns) + 1)

        # no index
        self.tsframe.to_csv(path, index=False)
        recons = DataFrame.from_csv(path, index_col=None)
        assert_almost_equal(self.tsframe.values, recons.values)

        # corner case
        dm = DataFrame({'s1': Series(lrange(3), lrange(3)),
                        's2': Series(lrange(2), lrange(2))})
        dm.to_csv(path)
        recons = DataFrame.from_csv(path)
        assert_frame_equal(dm, recons)
def __train_on_all_features(filename, up_filename, use_sgd_settings=False):
    csv_file = __get_filename(NEW_PATH, up_filename)
    dataframe = DataFrame.from_csv(csv_file, encoding='utf-8')
    print("Retrieving questions and classification labels...")
    training_data = dataframe[const.QUESTION_TEXT_KEY].copy()
    class_labels = dataframe[const.CLASS_LABEL_KEY].copy()
    print("Starting training of model")
    file = NEW_PATH + "models" + const.SEPARATOR + FILENAME_START + "_" + up_filename + ".pkl"
    model = create_and_save_model(training_data, class_labels, file, predict_proba=True,
                                  test_size=float(0.2), random_state=0, print_results=True,
                                  use_sgd_settings=use_sgd_settings)
    if model is not None:
        pipeline_svm = model.best_estimator_
        # set up the parameter values
        param_svm = [
            {
                'clf__C': [model.best_params_['clf__C']],
                'clf__kernel': [model.best_params_['clf__kernel']],
            },
        ]
        # check if gamma is a part of the parameters
        if model.best_params_.get('clf__gamma') is not None:
            param_svm[0]['clf__gamma'] = [model.best_params_.get('clf__gamma')]
        csv_file = NEW_PATH + const.SEPARATOR + filename + FILE_ENDING
        dataframe = DataFrame.from_csv(csv_file, encoding='utf-8')
        print("Retrieving questions and classification labels...")
        training_data = dataframe[const.QUESTION_TEXT_KEY].copy()
        class_labels = dataframe[const.CLASS_LABEL_KEY].copy()
        print("Starting training of model")
        filename = NEW_PATH + "models" + const.SEPARATOR + filename + ".pkl"
        create_singular_feature_detector_model(pipeline_svm, param_svm, filename,
                                               training_data, class_labels,
                                               test_size=float(0.2), random_state=0)
def test_to_csv_from_csv2(self):
    with ensure_clean('__tmp_to_csv_from_csv2__') as path:
        # duplicate index
        df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'],
                       columns=['x', 'y', 'z'])
        df.to_csv(path)
        result = DataFrame.from_csv(path)
        assert_frame_equal(result, df)

        midx = MultiIndex.from_tuples(
            [('A', 1, 2), ('A', 1, 2), ('B', 1, 2)])
        df = DataFrame(np.random.randn(3, 3), index=midx,
                       columns=['x', 'y', 'z'])
        df.to_csv(path)
        result = DataFrame.from_csv(path, index_col=[0, 1, 2],
                                    parse_dates=False)
        # TODO from_csv names index ['Unnamed: 1', 'Unnamed: 2'], should it?
        assert_frame_equal(result, df, check_names=False)

        # column aliases
        col_aliases = Index(['AA', 'X', 'Y', 'Z'])
        self.frame2.to_csv(path, header=col_aliases)
        rs = DataFrame.from_csv(path)
        xp = self.frame2.copy()
        xp.columns = col_aliases
        assert_frame_equal(xp, rs)

        self.assertRaises(ValueError, self.frame2.to_csv, path,
                          header=['AA', 'X'])
def compile_contig_matrices(names=None):
    if names is None:
        fs = sh.find(raw_ass, "-name", "*-smds.coverage.percontig").stdout.split("\n")[:-1]
        fs = [f for f in fs if "_2ref" in f]
    else:
        fs = sh.find(merged_ass + names, "-name", "*-smds.coverage.percontig").stdout.split("\n")[:-1]
    df = DataFrame.from_csv(fs[0], sep="\t")
    glob = df.ix[:, 0:2]
    for f in fs:
        if names is None:
            id = [c for c in f.split("/") if "IH" in c][0]
        else:
            id = [c.replace("map_", "") for c in f.split("/") if "map_" in c][0]
        values = DataFrame.from_csv(f, sep="\t")["cov_mean_sample_0"]
        assert sum([a != b for a, b in zip(values.index, glob.index)]) == 0
        if sum([a != b for a, b in zip(values.index, glob.index)]) == 0:
            glob[id] = values
        else:
            print f, "is weird"
    if names is None:
        glob.to_csv(stats_out + "all_contig_coverages.csv", sep="\t")
    else:
        glob.to_csv(stats_out + names + "_contig_coverages.csv", sep="\t")
        glob[samples].to_csv(stats_out + names + "_contig_coverages_for_concoct.csv", sep="\t")
urldata = {}
urldata['q'] = ticker = 'SPY'
urldata['x'] = 'NYSEARCA'
#urldata['x'] = 'NASDAQ'
urldata['i'] = 900
urldata['p'] = '15d'  # number of past trading days
urldata['f'] = 'd,o,h,l,c,v'  # requested data: d is time, o is open, c is closing,

intradata = get_and_proess_data(urldata)
# get an instance of the algorithm class
run_algo = EarningsAlgorithm.Algorithm()
train_file = df.from_csv("data/intradata_frompython_train.csv", index_col=False, header=0)
train_file_output = 'data/intradata_frompython_train_results.csv'
run_algo.train_algo(train_file, train_file_output)
test_file = df.from_csv("data/intradata_frompython_test.csv", index_col=False, header=0)
test_file_output = 'data/intradata_frompython_test_results.csv'
agg_signals, past_5_signals, past_5_values, past_5_bayes = run_algo.test_algo(
    test_file, test_file_output)
def __init__(self):
    self.database = Database(Config(False))
    self.features = DataFrame.from_csv('features.csv')
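# Note: DataFrame.from_csv, used throughout these snippets, was deprecated in
# pandas 0.21 and removed in pandas 1.0. A minimal migration sketch (the file
# name is the one from the snippet above); from_csv defaulted to index_col=0
# and parse_dates=True, so the explicit arguments preserve its behaviour:
import pandas as pd

features = pd.read_csv('features.csv', index_col=0, parse_dates=True)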
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import plot, ion, show  # interactive plotting
from pandas import DataFrame as df

Source = "~/Documents/repositories/SalesForecast/Source"
ItemKeyName = ['pid', 'size']
items = df.from_csv(path=Source + "/items.csv", sep='|')
prices = df.from_csv(path=Source + "/prices.csv", sep='|')
sales = df.from_csv(path=Source + "/train.csv", sep='|')

# reshape DataFrame
# ----------------------------------------------------------
# column(s) to index
items.set_index(['size'], append=True, inplace=True)
prices.set_index(['size'], append=True, inplace=True)
prices = prices.transpose()
# convert data type for index
prices = prices.reindex(pd.to_datetime(prices.index))

# cumulatively sold units for each item
# data['sum_Times'] = data['Times'].groupby(['userID']).cumsum()  # know about it
sales = sales.sort_values(by=['pid', 'size'])
sales.loc[:, 'cumUnits'] = sales.groupby(ItemKeyName)['units'].cumsum()
sales.set_index(keys=ItemKeyName, append=True, inplace=True)
sales = sales.swaplevel(i='date', j='pid')
sales = sales.swaplevel(i='date', j='size')
"""
reshaped items, prices and sales
pfam_sim = lambda p, q: float(len(p.pfams.intersection(q.pfams))) / max(len(p.pfams), len(q.pfams)) if max(len(p.pfams), len(q.pfams)) != 0 else None

pfam_simi = {}
for c in tqdm(subset_big_fams.values()):
    for g1 in tqdm(c):
        for g2 in c:
            if (g1, g2) not in pfam_simi:
                pfam_simi[(g1, g2)] = pfam_sim(g1, g2)

for g in tqdm(all_gs):
    if not os.path.exists(pjoin(g.path, "ref")):
        bench.tools['BBMap'].make_index(g)

DataFrame.from_dict({(k[0].name, k[1].name): {'pfam_simi': v}
                     for k, v in pfam_simi.items()}).transpose().to_csv("pfam_simis.csv")

ANIs = {}
tt = DataFrame.from_csv("ANIs_fams.csv")
ANIs = {(t[0], t[1][0]): {'ANI': t[1][1], 'coverage': t[1][2]} for t in tt.iterrows()}

for c in tqdm(subset_big_fams.values()):
    to_compute = set()
    for g1 in tqdm(c):
        for g2 in c:
            if not (g1, g2) in to_compute and not (g2, g1) in to_compute \
                    and not (g1.name, g2.name) in ANIs.keys() \
                    and not (g2.name, g1.name) in ANIs.keys():
                to_compute.add((g1, g2))
    data = Parallel(n_jobs=num_cores)(delayed(single_ANI)(i) for i in tqdm(to_compute))
    ANIs.update(data)

DataFrame.from_dict(ANIs).transpose().to_csv("ANIs_fams.csv")
def get_dataset(self, *args, **kwargs):
    return DataFrame.from_csv(self.resource.resource_file.path, index_col=None)
def test_to_csv_multiindex(self):
    frame = self.frame
    old_index = frame.index
    arrays = np.arange(len(old_index) * 2).reshape(2, -1)
    new_index = MultiIndex.from_arrays(arrays, names=['first', 'second'])
    frame.index = new_index

    with ensure_clean('__tmp_to_csv_multiindex__') as path:
        frame.to_csv(path, header=False)
        frame.to_csv(path, columns=['A', 'B'])

        # round trip
        frame.to_csv(path)
        df = DataFrame.from_csv(path, index_col=[0, 1], parse_dates=False)

        # TODO to_csv drops column name
        assert_frame_equal(frame, df, check_names=False)
        self.assertEqual(frame.index.names, df.index.names)

        # needed if setUp becomes a classmethod
        self.frame.index = old_index

        # try multiindex with dates
        tsframe = self.tsframe
        old_index = tsframe.index
        new_index = [old_index, np.arange(len(old_index))]
        tsframe.index = MultiIndex.from_arrays(new_index)

        tsframe.to_csv(path, index_label=['time', 'foo'])
        recons = DataFrame.from_csv(path, index_col=[0, 1])
        # TODO to_csv drops column name
        assert_frame_equal(tsframe, recons, check_names=False)

        # do not load index
        tsframe.to_csv(path)
        recons = DataFrame.from_csv(path, index_col=None)
        self.assertEqual(len(recons.columns), len(tsframe.columns) + 2)

        # no index
        tsframe.to_csv(path, index=False)
        recons = DataFrame.from_csv(path, index_col=None)
        assert_almost_equal(recons.values, self.tsframe.values)

        # needed if setUp becomes classmethod
        self.tsframe.index = old_index

    with ensure_clean('__tmp_to_csv_multiindex__') as path:
        # GH3571, GH1651, GH3141

        def _make_frame(names=None):
            if names is True:
                names = ['first', 'second']
            return DataFrame(np.random.randint(0, 10, size=(3, 3)),
                             columns=MultiIndex.from_tuples(
                                 [('bah', 'foo'), ('bah', 'bar'), ('ban', 'baz')],
                                 names=names),
                             dtype='int64')

        # column & index are multi-index
        df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1],
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        # column is mi
        df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1, 2, 3], index_col=0,
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        # dup column names?
        df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2],
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        # writing with no index
        df = _make_frame()
        df.to_csv(path, tupleize_cols=False, index=False)
        result = read_csv(path, header=[0, 1], tupleize_cols=False)
        assert_frame_equal(df, result)

        # we lose the names here
        df = _make_frame(True)
        df.to_csv(path, tupleize_cols=False, index=False)
        result = read_csv(path, header=[0, 1], tupleize_cols=False)
        self.assertTrue(all([x is None for x in result.columns.names]))
        result.columns.names = df.columns.names
        assert_frame_equal(df, result)

        # tupleize_cols=True and index=False
        df = _make_frame(True)
        df.to_csv(path, tupleize_cols=True, index=False)
        result = read_csv(path, header=0, tupleize_cols=True, index_col=None)
        result.columns = df.columns
        assert_frame_equal(df, result)

        # whatsnew example
        df = _make_frame()
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1], index_col=[0],
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        df = _make_frame(True)
        df.to_csv(path, tupleize_cols=False)
        result = read_csv(path, header=[0, 1], index_col=[0],
                          tupleize_cols=False)
        assert_frame_equal(df, result)

        # column & index are multi-index (compatibility)
        df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
        df.to_csv(path, tupleize_cols=True)
        result = read_csv(path, header=0, index_col=[0, 1],
                          tupleize_cols=True)
        result.columns = df.columns
        assert_frame_equal(df, result)

        # invalid options
        df = _make_frame(True)
        df.to_csv(path, tupleize_cols=False)

        for i in [6, 7]:
            msg = 'len of {i}, but only 5 lines in file'.format(i=i)
            with assertRaisesRegexp(ParserError, msg):
                read_csv(path, tupleize_cols=False,
                         header=lrange(i), index_col=0)

        # write with cols
        with assertRaisesRegexp(TypeError, 'cannot specify cols with a '
                                'MultiIndex'):
            df.to_csv(path, tupleize_cols=False, columns=['foo', 'bar'])

    with ensure_clean('__tmp_to_csv_multiindex__') as path:
        # empty
        tsframe[:0].to_csv(path)
        recons = DataFrame.from_csv(path)

        exp = tsframe[:0]
        exp.index = []

        tm.assert_index_equal(recons.columns, exp.columns)
        self.assertEqual(len(recons), 0)
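# A minimal sketch of the same MultiIndex-column round trip on a modern pandas
# (>= 1.0), where tupleize_cols and DataFrame.from_csv no longer exist; the
# frame mirrors _make_frame above:
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randint(0, 10, size=(3, 3)),
                  columns=pd.MultiIndex.from_tuples(
                      [('bah', 'foo'), ('bah', 'bar'), ('ban', 'baz')],
                      names=['first', 'second']))
df.to_csv('mi_roundtrip.csv')
result = pd.read_csv('mi_roundtrip.csv', header=[0, 1], index_col=0)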
import re

# sample_twitter is assumed to be defined earlier in the script
capitalized_words = r"[A-Z]\w+"
print(re.findall(capitalized_words, sample_twitter))

# Split my_string on spaces and print the result
spaces = r"\s+"
print(re.split(spaces, sample_twitter))

# Find all digits in my_string and print the result
digits = r"\d+"
re.findall(digits, sample_twitter)

from pandas import DataFrame
import pandas as pd

df = DataFrame.from_csv("movie_info.csv", sep=",")
movie_synopsis = df['synopsis']

#### First step is to delete missing values in the synopsis column
movie_synopsis_nomissing = [r for r in movie_synopsis if pd.notnull(r)]

#### Let's use one of the synopses as an example
sample = movie_synopsis_nomissing[1494]
sample

### Convert to lower case
sample = sample.lower()
sample

### Removing numbers
def import_csv(pth):
    y = DataFrame.from_csv(pth, sep='\t', index_col=0)
    y = y.fillna('')
    return y
from pandas import DataFrame, Series
import pandas as pd
import os
import re
from selenium import webdriver
#from selenium.webdriver.common.by import By
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as EC
from threading import Thread, BoundedSemaphore
import threading
# from multiprocessing import Pool, cpu_count
# from functools import partial

df_topics = DataFrame(columns=('Competition', 'Topic', 'URL', 'By', 'Views', 'Replies', 'Score'))
df_comp_detailed = DataFrame.from_csv('competitions_detailed.csv', encoding='utf-8')
root = 'https://www.kaggle.com'
topic_index = 0
Topic_Num = 0
#pool_data = BoundedSemaphore(value=8)
pool_html = BoundedSemaphore(value=16)


def get_html(url, file_name):
    global pool_html
    global Topic_Num
    pool_html.acquire()
    #if not os.path.exists(file_name+'.html'):
    #f = open(file_name+'.html','wb')
try:
    from bs4 import BeautifulSoup
except ImportError:
    from BeautifulSoup import BeautifulSoup
import time, requests, urllib2, webbrowser, os, csv
import geocoder, math
from pandas import DataFrame

#&markers=color:blue%7Clabel:S%7C40.702147,-74.015794

df = DataFrame.from_csv('analysed_with_zips.csv', sep=',', parse_dates=False)
#df.insert(11,'ZIP','NA')

map_string_list = []
safe_markers = []
unsafe_markers = []
lat_long_list = []
safe_zips = []
unsafe_zips = []
final_string = ''
unsafe_info_window_string = []
safe_info_window_string = []

for index, row in df.iterrows():
    lat = row[8]
    lng = row[9]
    source = row[0]
    label = row[7]
    if lat == 'NA' and lng == 'NA':
        continue
def main():
    #################################################
    ######        LOADING REVIEW DATA         #######
    #################################################
    start_time = time.clock()
    print 'Entering the main thread to start the program'
    data = pd.read_csv('Review_chennai_30000_3_2.csv', sep='|')
    #print data.head()
    revs = data.loc[:, ['rest_review', 'r_name', 'reviewtext', 'date']]
    #print revs.head()
    #print revs.count()
    for i in list(np.where(pd.isnull(revs))):
        revs.drop(revs.index[i], inplace=True)
    #print revs.count()
    #print type(revs), revs

    # clean ASCII codes in the review text (emoticons and other stuff)
    revs['text'] = revs['reviewtext'].apply(lambda x: x.decode('unicode_escape').
                                            encode('ascii', 'ignore').
                                            strip())
    revs['r_name'] = revs['r_name'].apply(lambda x: x.decode('unicode_escape').
                                          encode('ascii', 'ignore').
                                          strip())

    # clean review text from '|' pipe and |<>:/\@#$%^&*()!~?="'' symbols that are
    # used as separators in pandas
    revs['cleanedtext'] = revs['text'].apply(clean_reviewtext_symbols)
    revs['cleanedtext_dots'] = revs['cleanedtext'].apply(clean_reviewtext_dots)
    #print 'Cleaned Text \n', revs['cleanedtext']
    #print 'With dots removed \n', revs['cleanedtext_dots']

    reviews_text = revs['cleanedtext_dots'].str.lower()
    reviews_text = reviews_text.values
    res_names = revs['r_name'].values
    print 'RESTAURANT NAMES = \n', res_names
    print res_names[0], res_names[-1]
    score = 1.0
    created_dates = revs['date'].values
    print 'REVIEWED DATES = \n', created_dates
    print created_dates[0], created_dates[-1]
    #print type(reviews_text), reviews_text

    #################################################
    ###   MODEL LOADING WITHOUT PICKLES   ###  0.5 seconds
    #################################################
    global vect
    global mod
    start_time = time.clock()
    # vect = MyVectorizer(min_df=2,
    #                     ngram_range=(1, 2))
    # # plug _tfidf._idf_diag
    # vect._tfidf._idf_diag = sp.spdiags(idfs,
    #                                    diags=0,
    #                                    m=len(idfs),
    #                                    n=len(idfs))
    # vocabulary = json.load(open('vocabulary.json', mode='rb'))
    # vect.vocabulary_ = vocabulary
    start_time = time.clock()
    with open('Vectorer.pkl', 'rb') as g:
        vect = cPickle.load(g)
    with open('Classifier.pkl', 'rb') as g:
        mod = cPickle.load(g)
    duration = time.clock() - start_time
    print 'Time taken to load the models is ', duration

    #############################################
    ###   MODEL LOADING WITH PICKLES   ###  2.5 seconds
    #############################################
    # timeload = time.clock()
    # with open('Vect_cPickle_Ngrams.pkl', 'rb') as f:
    #     vect = cPickle.load(f)
    # with open('Log_Reg_Model_cPickle_Ngrams.pkl', 'rb') as g:
    #     mod = cPickle.load(g)
    # print 'Time for cPickle loading of models: ', time.clock() - timeload

    #######################################
    ####      DATABASE CREATION       #####
    #######################################
    conn = sqlite3.connect('final_30000_3_2.db')
    print "Opened database successfully"
    # conn.execute('''CREATE TABLE RATINGS
    #                 (ID INT PRIMARY KEY NOT NULL,
    #                  RES_NAME TEXT NOT NULL,
    #                  RES_ID INT NOT NULL,
    #                  DISH_NAME TEXT NOT NULL,
    #                  SENTIMENT CHAR(10) NOT NULL)''')
    # print "Table created successfully"
    conn.execute('''CREATE TABLE SA_REVIEW_SCORE
                    (ID INT PRIMARY KEY NOT NULL,
                     REVIEW_ID INTEGER NOT NULL,
                     RES_NAME TEXT NOT NULL,
                     KEYWORD_ID INTEGER NOT NULL,
                     DISH_NAME TEXT NOT NULL,
                     SCORE REAL NOT NULL,
                     SENTIMENT CHAR(10) NOT NULL,
                     CREATED_DATE DATE NOT NULL)''')
    #print "Table created successfully"
    conn.execute('''CREATE TABLE ASPECTS_SCORE
                    (ID INT PRIMARY KEY NOT NULL,
                     REVIEW_ID INTEGER NOT NULL,
                     RES_NAME TEXT NOT NULL,
                     REVIEW_SENTIMENT TEXT NOT NULL,
                     SERVICE_SENTIMENT TEXT NOT NULL,
                     VALUE_SENTIMENT TEXT NOT NULL,
                     AMBIENCE_SENTIMENT TEXT NOT NULL,
                     FOOD_SENTIMENT TEXT NOT NULL)''')
    print "Table created successfully"

    ####### Used for creating dish_list list in dish_list.py #########
    # dish_df = pd.read_csv('Food Dishes_SA.csv')
    # dish_list = dish_df['Table 1'].values
    # print dish_list
    # dish_list = [x.lower() for x in dish_list]
    # print dish_list
    # dish_list1 = []
    # dish_list1 = '\t'.join(dish_list)

    text2 = """Sushi is amazingly bad. Service is bad. Noodles is awesome. Interiors were badly made. Nigiri is good. Idli is amazing. Aloo gobi is nice. What can i say about Mutton Biriyani? It is bad. The waiters are patient. They are really good."""
    text1 = """Waiters are patient. They are also amazing. Biryani is awesome. I would come any day here."""

    servicelist = ['service', 'waiter', 'welcome', 'friendly', 'staff', 'waitress',
                   'bar tender', 'bartender', 'chef', 'people', 'steward',
                   'stewardess', 'manners']
    valuelist = ['value', 'cheap', 'cost', 'price', 'economical', 'reasonable',
                 'budget', 'pricey', 'steep', 'costly']
    ambiencelist = ['place', 'places', 'environment', 'atmosphere', 'climate',
                    'surroundings', 'look', 'mood', 'view', 'serene', 'decor',
                    'clean', 'pristine', 'neat']
    foodlist = ['meal', 'lunch', 'dinner', 'brunch', 'snacks', 'cuisine', 'entree',
                'starters', 'meals', 'lunches', 'brunches', 'entrees']

    cp = nltk.RegexpParser(grammar)
    dish_counter = 0
    review_index = 0
    nouns_list = []
    ambience_sentiment = '#'
    value_sentiment = '#'
    food_sentiment = '#'
    service_sentiment = '#'
    review_sentiment = '#'

    for review in reviews_text:
        dup_dishes = []
        review_index += 1
        print '########## REVIEW # %d ##############' % (review_index)
        print ' '
        print ' '
        print ' '
        print ' '
        print ' '
        print ' '
        print ' '
        print ' '
        print ' ####################################'
        print "REVIEW = ", review

        ##### Removing | symbol in the review text #######
        #review = review_cleanup(review, unwanted_elements)
        # review = re.sub('[|]', '', review)
        # print 'Cleaned Review = ', review

        #####################################################
        ######   FULL REVIEW SENTIMENT PREDICTION      ######
        #####################################################
        REVIEWDATA = StringIO("""Review |""" + review)
        df = DataFrame.from_csv(REVIEWDATA, sep="|", parse_dates=False)
        print 'FULL REVIEW SENTIMENT PREDICTION GOES ON .......'
        print df
        review_bow = vect.transform(df['Review'])
        pred_review = mod.predict(review_bow)
        proba_review = mod.predict_proba(review_bow)
        #print review_bow
        print pred_review
        print proba_review
        if str(pred_review[0]) == '1':
            result = 'Positive'
            dict1 = {review: result}
            score = proba_review[0][1]
            print '++++++++++++++++++++'
            print ' The Polarity of the review is ', result
        else:
            result = 'Negative'
            #dict1 = {sent : result}
            score = proba_review[0][0]
            print '--------------------'
            print ' The Polarity of the review is ', result
        review_sentiment = result
        print 'Full review sentiment is ', review_sentiment

        ##################################################
        ######    SENTENCE SENTIMENT PREDICTION     ######
        ##################################################
        sentences = nltk.sent_tokenize(review)
        sent_index = 0
        prp_list_index = []
        for sent in sentences:
            sent_index += 1
            #print 'Sentence No: ', sent_index
            tagged = nltk.pos_tag(nltk.word_tokenize(sent))
            #print tagged
            #nouns = [word for word, pos in tagged if pos in ['NN', 'NNP', 'NNS']]
            #print 'Nouns = ', nouns
            #adjectives = [word for word, pos in tagged if pos in ['JJ']]
            #print 'Adjectives = ', adjectives
            #adverbs = [word for word, pos in tagged if pos in ['RB', 'RBS']]
            #print 'Adverbs = ', adverbs

            ######## COLLECTING SUCCESSIVE NOUNS ########
            parsed_content = cp.parse(tagged)
            dish1 = re.findall(r'NP\s(.*?)/NN\w*', str(parsed_content))
            dish2 = re.findall(r'NP\s(.*?)/NN\w*\s(.*?)/NN', str(parsed_content))
            dish3 = re.findall(r'NP\s(.*?)/NN\w*\s(.*?)/NN\w*\s(.*?)/NN', str(parsed_content))
            dish4 = re.findall(r'NP\s(.*?)/NN\w*\s(.*?)/NN\w*\s(.*?)/NN\w*\s(.*?)/NN', str(parsed_content))
            print parsed_content
            nouns = []
            dlist = []
            if len(dish4) != 0:
                for t in dish4:
                    noun = ' '.join(item for item in t)
                    nouns.append(noun)
                    nouns_list.append(noun)
                #print 'Nouns after 1st iteration ', nouns
                dlist = '\t'.join(nouns)
            if len(dish3) != 0:
                for t in dish3:
                    noun = ' '.join(item for item in t)
                    if noun not in dlist:
                        nouns.append(noun)
                        nouns_list.append(noun)
                #print 'Nouns after 1st iteration ', nouns
                dlist = '\t'.join(nouns)
            if len(dish2) != 0:
                for t in dish2:
                    noun = ' '.join(item for item in t)
                    if noun not in dlist:
                        nouns.append(noun)
                        nouns_list.append(noun)
                #print 'Nouns after 2nd iteration ', nouns
                dlist = '\t'.join(nouns)
            if len(dish1) != 0:
                for t in dish1:
                    if t not in dlist:
                        nouns.append(t)
                        nouns_list.append(t)
            print 'Nouns after last iteration ', nouns

            # prps = [word for word, pos in tagged if pos in ['PRP']]
            # print 'PRPs = ', prps
            ##### Including NamedEntity
            # namedEnt = nltk.ne_chunk(tagged, binary=True)
            # print 'NamedEnt = ', namedEnt
            #namedEnt.draw()
            # for noun in nouns:
            #     print noun, type(noun)

            flagService = False
            flagDish = False
            flagAmbience = False
            flagValue = False
            flagFood = False
            for noun in nouns:
                ans = getKeyFromDictionary(noun, dish_dict)
                #print 'Printing ans = \n', ans
                if ans and len(ans) > 3:
                    flagDish = True
                if noun in servicelist:
                    flagService = True
                if noun in ambiencelist:
                    flagAmbience = True
                if noun in valuelist:
                    flagValue = True
                if noun in foodlist:
                    flagFood = True
                else:
                    pass
            #print 'Flags : ', flagAmbience
            #print 'Flag = ', flag

            if flagDish:
                #print 'predicting sentiment for sentence', sent
                ############################
                ### PREDICTING SENTIMENT ###
                ############################
                test_time = time.clock()
                TESTDATA = StringIO("""Review |""" + sent)
                df1 = DataFrame.from_csv(TESTDATA, sep="|", parse_dates=False)
                #print df1
                test_time = time.clock()
                test_bow = vect.transform(df1['Review'])
                #print 'Data frame creation time = ', time.clock() - test_time
                prediction = mod.predict(test_bow)
                probability = mod.predict_proba(test_bow)
                #print 'Score = ', score, type(score), score[0][1]
                #print prediction, type(str(prediction[0]))
                if str(prediction[0]) == '1':
                    result = 'Positive'
                    dict = {sent: result}
                    score = probability[0][1]
                    #print '++++++++++++++++++++'
                    #print ' The Polarity of the review is ', result
                    # print 'Time to predict the review = ', time.clock() - timet
                    # return render_template('posres.html', result=dict)
                else:
                    result = 'Negative'
                    dict = {sent: result}
                    score = probability[0][0]
                    #print '--------------------'
                    #print ' The Polarity of the review is ', result
                    # print 'Time to predict the review = ', time.clock() - timet
                    # return render_template('negres.html', result=dict)
                #print 'inserting noun,prediction,res_name,res_id,review_id into sqlite3 db'
                #print 'creating a list & jsonify it as output'
                #print 'Keyword %s is %s' % (noun, result)
                #cur.execute("INSERT INTO Contacts VALUES (?, ?, ?, ?);", (firstname, lastname, phone, email))
                for noun in nouns:
                    ans = getKeyFromDictionary(noun, dish_dict)
                    if ans and len(ans) > 3 and ans not in dup_dishes:
                        dish_counter += 1  # dishes
                        #conn.execute("INSERT INTO RATINGS VALUES (?, ?, ?, ?, ?)", (counter, 'Mark', 25, noun, result))
                        conn.execute("INSERT INTO SA_REVIEW_SCORE VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                                     (dish_counter, review_index, res_names[review_index - 1],
                                      dish_counter, ans, score, result,
                                      created_dates[review_index - 1]))
                        print 'Noun = %s & Sentiment = %s ' % (ans, result)
                        dup_dishes.append(ans)
                    else:
                        pass
                        #conn.execute("INSERT INTO RATINGS VALUES (?, ?, ?, ?, ?)", (counter, 'Mark', 25, noun, result))
                conn.commit()
                #print "Records created successfully"

            if flagService:
                #print 'predicting sentiment for sentence', sent
                ############################
                ### PREDICTING SENTIMENT ###
                ############################
                test_time = time.clock()
                TESTDATA = StringIO("""Review |""" + sent)
                df1 = DataFrame.from_csv(TESTDATA, sep="|", parse_dates=False)
                #print df1
                test_time = time.clock()
                test_bow = vect.transform(df1['Review'])
                #print 'Data frame creation time = ', time.clock() - test_time
                prediction = mod.predict(test_bow)
                probability = mod.predict_proba(test_bow)
                #print 'Score = ', score, type(score), score[0][1]
                #print prediction, type(str(prediction[0]))
                if str(prediction[0]) == '1':
                    serviceresult = 'Positive'
                    #print '++++++++++++++++++++'
                    print ' The Polarity of the review is ', serviceresult
                    # print 'Time to predict the review = ', time.clock() - timet
                    # return render_template('posres.html', result=dict)
                else:
                    serviceresult = 'Negative'
                    print ' The Polarity of the review is ', serviceresult
                if serviceresult == 'Positive':
                    service_sentiment = '1'
                elif serviceresult == 'Negative':
                    service_sentiment = '0'
                else:
                    service_sentiment = '#'
                #print '--------------------'
                #print ' The Polarity of the review is ', result
                # print 'Time to predict the review = ', time.clock() - timet
                # return render_template('negres.html', result=dict)

            if flagFood:
                #print 'predicting sentiment for sentence', sent
                ############################
                ### PREDICTING SENTIMENT ###
                ############################
                test_time = time.clock()
                TESTDATA = StringIO("""Review |""" + sent)
                df1 = DataFrame.from_csv(TESTDATA, sep="|", parse_dates=False)
                #print df1
                test_time = time.clock()
                test_bow = vect.transform(df1['Review'])
                #print 'Data frame creation time = ', time.clock() - test_time
                prediction = mod.predict(test_bow)
                probability = mod.predict_proba(test_bow)
                #print 'Score = ', score, type(score), score[0][1]
                #print prediction, type(str(prediction[0]))
                if str(prediction[0]) == '1':
                    foodresult = 'Positive'
                    #print '++++++++++++++++++++'
                    #print ' The Polarity of the review is ', result
                    # print 'Time to predict the review = ', time.clock() - timet
                    # return render_template('posres.html', result=dict)
                else:
                    foodresult = 'Negative'
                if foodresult == 'Positive':
                    food_sentiment = '1'
                elif foodresult == 'Negative':
                    food_sentiment = '0'
                else:
                    food_sentiment = '#'
                #print '--------------------'
                #print ' The Polarity of the review is ', result
                # print 'Time to predict the review = ', time.clock() - timet
                # return render_template('negres.html', result=dict)

            if flagAmbience:
                print 'Inside Ambience'
                #print 'predicting sentiment for sentence', sent
                ############################
                ### PREDICTING SENTIMENT ###
                ############################
                test_time = time.clock()
                TESTDATA = StringIO("""Review |""" + sent)
                df1 = DataFrame.from_csv(TESTDATA, sep="|", parse_dates=False)
                #print df1
                test_time = time.clock()
                test_bow = vect.transform(df1['Review'])
                #print 'Data frame creation time = ', time.clock() - test_time
                prediction = mod.predict(test_bow)
                probability = mod.predict_proba(test_bow)
                #print 'Score = ', score, type(score), score[0][1]
                #print prediction, type(str(prediction[0]))
                if str(prediction[0]) == '1':
                    ambienceresult = 'Positive'
                    #print '++++++++++++++++++++'
                    #print ' The Polarity of the review is ', result
                    # print 'Time to predict the review = ', time.clock() - timet
                    # return render_template('posres.html', result=dict)
                else:
                    ambienceresult = 'Negative'
                if ambienceresult == 'Positive':
                    #print 'Positive'
                    ambience_sentiment = '1'
                    print ambience_sentiment
                elif ambienceresult == 'Negative':
                    #print 'Negative'
                    ambience_sentiment = '0'
                    print ambience_sentiment
                else:
                    ambience_sentiment = '#'
                    print ambience_sentiment
                #print '--------------------'
                #print ' The Polarity of the review is ', result
                # print 'Time to predict the review = ', time.clock() - timet
                # return render_template('negres.html', result=dict)

            if flagValue:
                #print 'predicting sentiment for sentence', sent
                ############################
                ### PREDICTING SENTIMENT ###
                ############################
                test_time = time.clock()
                TESTDATA = StringIO("""Review |""" + sent)
                df1 = DataFrame.from_csv(TESTDATA, sep="|", parse_dates=False)
                #print df1
                test_time = time.clock()
                test_bow = vect.transform(df1['Review'])
                #print 'Data frame creation time = ', time.clock() - test_time
                prediction = mod.predict(test_bow)
                probability = mod.predict_proba(test_bow)
                #print 'Score = ', score, type(score), score[0][1]
                #print prediction, type(str(prediction[0]))
                if str(prediction[0]) == '1':
                    valueresult = 'Positive'
                    #print '++++++++++++++++++++'
                    #print ' The Polarity of the review is ', result
                    # print 'Time to predict the review = ', time.clock() - timet
                    # return render_template('posres.html', result=dict)
                else:
                    valueresult = 'Negative'
                if valueresult == 'Positive':
                    value_sentiment = '1'
                elif valueresult == 'Negative':
                    value_sentiment = '0'
                else:
                    value_sentiment = '#'
                #print '--------------------'
                #print ' The Polarity of the review is ', result
                # print 'Time to predict the review = ', time.clock() - timet
                # return render_template('negres.html', result=dict)

            #print 'inserting noun,prediction,res_name,res_id,review_id into sqlite3 db'
            #print 'creating a list & jsonify it as output'
            #print 'Keyword %s is %s' % (noun, result)
            #cur.execute("INSERT INTO Contacts VALUES (?, ?, ?, ?);", (firstname, lastname, phone, email))
            if noun in servicelist or noun in ambiencelist or noun in valuelist or noun in foodlist:
                #if noun in ambiencelist:
                dish_counter += 1  # dishes
                #conn.execute("INSERT INTO RATINGS VALUES (?, ?, ?, ?, ?)", (counter, 'Mark', 25, noun, result))
                print 'Inside aspects_score printing module'
                print ' '
                print ' '
                print ambience_sentiment, value_sentiment
                conn.execute("INSERT INTO ASPECTS_SCORE VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                             (dish_counter, review_index, res_names[review_index - 1],
                              review_sentiment, service_sentiment, value_sentiment,
                              ambience_sentiment, food_sentiment))
                # print 'Noun = %s & Sentiment = %s ' % (noun, result)
                # dup_dishes.append(noun)
            else:
                pass
                #conn.execute("INSERT INTO RATINGS VALUES (?, ?, ?, ?, ?)", (counter, 'Mark', 25, noun, result))
            conn.commit()
            #print "Records created successfully"

    cursor = conn.execute("SELECT * from SA_REVIEW_SCORE")
    print 'Printing Stored values in SA_REVIEW_SCORE table'
    print "REVIEW_ID \t\t RES_NAME \t\t KEYWORD_ID \t\t DISH_NAME \t\t SCORE \t\t SENTIMENT \t\t CREATED_DATE "
    for row in cursor:
        # print "REVIEW_ID = ", row[1]
        # print "RES_NAME = ", row[2]
        # print "KEYWORD_ID = ", row[3]
        # print "DISH_NAME = ", row[4]
        # print "SCORE = %3.2f" % (row[5])
        # print "SENTIMENT = ", row[6]
        # print "CREATED_DATE = ", row[7], "\n"
        print "%d \t\t %s \t\t %d \t\t %s \t\t %3.2f \t\t %s \t\t %s" % (row[1], row[2], row[3], row[4], row[5], row[6], row[7])

    cursor1 = conn.execute("SELECT * from ASPECTS_SCORE")
    print 'Printing Stored values in ASPECTS_SCORE table'
    print "REVIEW_ID \t\t RES_NAME \t\t REVIEW_SENTIMENT \t\t SERVICE_SENTIMENT \t\t VALUE_SENTIMENT \t\t AMBIENCE_SENTIMENT \t\t FOOD_SENTIMENT "
    for row in cursor1:
        print "%d \t\t %s \t\t %s \t\t %s \t\t %s \t\t %s \t\t %s" % (row[1], row[2], row[3], row[4], row[5], row[6], row[7])

    conn.close()
    print 'Program exiting.....'
    print 'Total time taken: ', time.clock() - start_time, ' seconds'
    return 0
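# The StringIO + DataFrame.from_csv dance above builds a one-row frame just to
# feed a single sentence to the vectorizer. A minimal sketch of the same step
# without the CSV round trip, assuming vect and mod are the pickled vectorizer
# and classifier loaded earlier (scikit-learn vectorizers accept any iterable
# of strings):
def predict_sentiment(sent, vect, mod):
    bow = vect.transform([sent])         # vectorize a single sentence
    label = mod.predict(bow)[0]          # predicted class label
    proba = mod.predict_proba(bow)[0]    # [p_negative, p_positive]
    return ('Positive', proba[1]) if str(label) == '1' else ('Negative', proba[0])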
redosage = re.compile(r'\d+')
dosage = [int(redosage.findall(name)[0]) for name in names]

# finds 'unite' (unit)
reunite = re.compile(r'microgrammes|µg|mg|c|g')
unite = [reunite.findall(name)[0] for name in names]

# finds 'forme' (form)
reforme = re.compile(r'comprimé sécable')
forme = [reforme.findall(name)[0] for name in names]

###############################################################################
# part 2
###############################################################################

# load base
base = DataFrame.from_csv(
    '/home/roms/Telecom/P1/Kit big data/Work/MEDICAM 2008-2013-AMELI clean.csv',
    header=0, sep=',')

# drugs delisted from reimbursement in 2013
derembourse = base[(base['Montant remboursé 2012'] != "0") &
                   (base['Montant remboursé 2013'] == "0")]['NOM COURT']
derembourse.to_csv('derembourse2013.csv')

# drugs newly reimbursed in 2013
rembourse = base[(base['Montant remboursé 2012'] == "0") &
                 (base['Montant remboursé 2013'] != "0")]['NOM COURT']
rembourse.to_csv('rembourse2013.csv')
import csv
import bisect
import sklearn
from pandas import DataFrame
from sklearn.externals import joblib
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score
from sklearn.metrics import f1_score, confusion_matrix, cohen_kappa_score
from sklearn import tree
from sklearn import svm, datasets, cross_validation
from sklearn import preprocessing
# the imblearn module provides several ways to deal with imbalance in data
from imblearn.over_sampling import SMOTE

# load the test data
df_test_data = DataFrame.from_csv("test_data.csv", index_col=False)
df_test_target = DataFrame.from_csv("test_target.csv", index_col=False)

# print the shape of the dataframes
print df_test_data.shape
print df_test_target.shape[0]


def test():
    # load the classifier
    clf = joblib.load('model/random_clf.pkl')
    # predict on the test data
    pred = clf.predict(df_test_data)
    # printing the evaluation metrics
    # accuracy can't be used as the sole measure of the model, hence metrics
    # like recall, precision and kappa are also measured
codes_hd5_filename = splitext(codes_hd5_filename)[
    0] + '_' + pop_size + splitext(codes_hd5_filename)[1]
idx_hd5_filename = splitext(idx_hd5_filename)[
    0] + '_' + pop_size + splitext(idx_hd5_filename)[1]

query_args = {'dbname': dbname}
if args['psql_host'] is not None:
    query_args['host'] = args['psql_host']
if args['psql_password'] is not None:
    query_args['password'] = args['psql_password']

#############
# Population extraction

data = None
if ((args['extract_pop'] == 0) | (args['extract_pop'] == 1)) & isfile(
        os.path.join(outPath, static_filename)):
    data = DataFrame.from_csv(os.path.join(outPath, static_filename))
    data = sanitize_df(data, static_data_schema)
    """
    data['admission_type'] = data['admission_type'].astype('category')
    data['gender'] = data['gender'].astype('category')
    data['first_careunit'] = data['first_careunit'].astype('category')
    data['ethnicity'] = data['ethnicity'].astype('category')
    data['intime'] = pd.to_datetime(data['intime'])  #, format="%m/%d/%Y"))
    data['outtime'] = pd.to_datetime(data['outtime'])
    data['admittime'] = pd.to_datetime(data['admittime'])
    data['dischtime'] = pd.to_datetime(data['dischtime'])
    data['deathtime'] = pd.to_datetime(data['deathtime'])
    """
elif ((args['extract_pop'] == 1) & (not isfile(os.path.join(outPath, static_filename)))) | (
def cal_fun(bam_path, bed_file):
    '''
    Using an input bed file and a bam file, calculate per-position coverage
    info over the intervals in the bed.
    :param bam_path:
    :param bed_file:
    :return: writes the cov_info file next to the bam file, 1-based coordinates.
    '''
    bed_file = df.from_csv(bed_file, index_col=False, sep='\t')
    try:
        bamfile = pysam.AlignmentFile(bam_path, 'rb')
    except:
        print("bam file needed for calculation doesn't exist")
        raise IOError
    fastafile = pysam.FastaFile(filename=REF_file_path)  # define each file path.
    # define columns.
    result = 'Gene\tChr\tPosition\tReference\tbase\tA\tC\tG\tT\tA_Rate\tC_Rate\tG_Rate\tT_Rate\t1\t2\t3\t4\n'
    for i in range(len(bed_file)):  # iterate over intervals
        Chr = bed_file.iloc[i, 0]
        start = min(int(bed_file.iloc[i, 2]), int(bed_file.iloc[i, 1]))
        end = max(int(bed_file.iloc[i, 2]), int(bed_file.iloc[i, 1]))
        Gene_name = str(bed_file.iloc[i, 3])  # fetch basic info.
        coverage_ACGT = bamfile.count_coverage(
            Chr, start - 1, end, read_callback='nofilter',
            quality_threshold=0)  # from 1-based to 0-based
        # make it a dict in the preset order.
        base_counter = dict(zip(['A', 'C', 'G', 'T'], coverage_ACGT))
        ref_base_str = fastafile.fetch(Chr, start - 1, end)  # needs 0-based start/end.
        if sum(sum(j) for j in coverage_ACGT) != 0:
            # if this interval doesn't have any reads, we ignore it.
            for base_n in range(start, end + 1):
                n_read = int(sum([k[range(start, end + 1).index(base_n)]
                                  for k in coverage_ACGT]))  # total base count
                if n_read == 0:
                    continue  # if this position has no bases, skip it.
                result += Gene_name + '\t' + Chr + '\t' + str(base_n) + '\t' + \
                    ref_base_str[range(start, end + 1).index(base_n)] + '\t' + '\t'
                for base in ['A', 'C', 'G', 'T']:
                    # write the A/C/G/T counts.
                    result += str(base_counter[base][range(start, end + 1).index(base_n)]) + '\t'
                for base in ['A', 'C', 'G', 'T']:
                    # write the rates of A/C/G/T
                    result += str(round(float(base_counter[base][range(
                        start, end + 1).index(base_n)]) / n_read, 4)) + '\t'
                # build a list to sort the A/C/G/T base counts.
                little_rank_list = [(each_one[1][range(start, end + 1).index(base_n)], each_one[0])
                                    for each_one in base_counter.items()]
                # emit the ranked bases as columns and terminate the line.
                result += '\t'.join([ranked_base[1] for ranked_base in
                                     sorted(little_rank_list, reverse=True)]) + '\n'
            if result[-1:] != '\n':
                # in case the last position had no bases and was skipped,
                # make sure the line is terminated.
                result += '\n'
        else:
            pass
    with open(bam_path.partition('.')[0] + '_cov.info', 'w') as f1:
        f1.write(result)
    print('Cal cov info complete')
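# pysam.AlignmentFile.count_coverage returns four parallel arrays (A, C, G, T
# counts), one entry per reference position in the half-open 0-based window
# [start, stop). A minimal sketch of reading total depth at one 1-based
# position; "sample.bam", "chr1" and pos are placeholders:
import pysam

pos = 100  # 1-based position of interest (placeholder)
bam = pysam.AlignmentFile("sample.bam", "rb")
a, c, g, t = bam.count_coverage("chr1", pos - 1, pos, quality_threshold=0)
depth = a[0] + c[0] + g[0] + t[0]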
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame

df = DataFrame.from_csv("output.dat", sep="\t")
values = np.transpose(df.values)

fig, ax = plt.subplots()
origin = "lower"
pcm = ax.imshow(np.abs(values), cmap='bone_r', origin=origin, aspect="auto")
fig.colorbar(pcm, ax=ax)
plt.show()
def _do_test(df, r_dtype=None, c_dtype=None,
             rnlvl=None, cnlvl=None, dupe_col=False):

    kwargs = dict(parse_dates=False)
    if cnlvl:
        if rnlvl is not None:
            kwargs['index_col'] = lrange(rnlvl)
        kwargs['header'] = lrange(cnlvl)
        with ensure_clean('__tmp_to_csv_moar__') as path:
            df.to_csv(path, encoding='utf8',
                      chunksize=chunksize, tupleize_cols=False)
            recons = DataFrame.from_csv(path, tupleize_cols=False, **kwargs)
    else:
        kwargs['header'] = 0
        with ensure_clean('__tmp_to_csv_moar__') as path:
            df.to_csv(path, encoding='utf8', chunksize=chunksize)
            recons = DataFrame.from_csv(path, **kwargs)

    def _to_uni(x):
        if not isinstance(x, compat.text_type):
            return x.decode('utf8')
        return x

    if dupe_col:
        # read_csv disambiguates the columns by
        # labeling them dupe.1, dupe.2, etc'. monkey patch columns
        recons.columns = df.columns
    if rnlvl and not cnlvl:
        delta_lvl = [recons.iloc[:, i].values for i in range(rnlvl - 1)]
        ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
        recons.index = ix
        recons = recons.iloc[:, rnlvl - 1:]

    type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O')
    if r_dtype:
        if r_dtype == 'u':  # unicode
            r_dtype = 'O'
            recons.index = np.array(lmap(_to_uni, recons.index),
                                    dtype=r_dtype)
            df.index = np.array(lmap(_to_uni, df.index), dtype=r_dtype)
        elif r_dtype == 'dt':  # datetime
            r_dtype = 'O'
            recons.index = np.array(lmap(Timestamp, recons.index),
                                    dtype=r_dtype)
            df.index = np.array(lmap(Timestamp, df.index), dtype=r_dtype)
        elif r_dtype == 'p':
            r_dtype = 'O'
            recons.index = np.array(list(map(Timestamp,
                                             to_datetime(recons.index))),
                                    dtype=r_dtype)
            df.index = np.array(list(map(Timestamp,
                                         df.index.to_timestamp())),
                                dtype=r_dtype)
        else:
            r_dtype = type_map.get(r_dtype)
            recons.index = np.array(recons.index, dtype=r_dtype)
            df.index = np.array(df.index, dtype=r_dtype)
    if c_dtype:
        if c_dtype == 'u':
            c_dtype = 'O'
            recons.columns = np.array(lmap(_to_uni, recons.columns),
                                      dtype=c_dtype)
            df.columns = np.array(lmap(_to_uni, df.columns), dtype=c_dtype)
        elif c_dtype == 'dt':
            c_dtype = 'O'
            recons.columns = np.array(lmap(Timestamp, recons.columns),
                                      dtype=c_dtype)
            df.columns = np.array(lmap(Timestamp, df.columns),
                                  dtype=c_dtype)
        elif c_dtype == 'p':
            c_dtype = 'O'
            recons.columns = np.array(lmap(Timestamp,
                                           to_datetime(recons.columns)),
                                      dtype=c_dtype)
            df.columns = np.array(lmap(Timestamp,
                                       df.columns.to_timestamp()),
                                  dtype=c_dtype)
        else:
            c_dtype = type_map.get(c_dtype)
            recons.columns = np.array(recons.columns, dtype=c_dtype)
            df.columns = np.array(df.columns, dtype=c_dtype)

    assert_frame_equal(df, recons, check_names=False,
                       check_less_precise=True)
def main(isLoadData=1, isCutData=0, PieceLength=500, isLoadFeatures=1,
         isGetFeaturesNaNs=0, isLoadLabels=1, LabelFileName={},
         FeatureMethod='Quantization', LabelBy='PANSS'):
    # -- TODO:
    # -- make sure it works for len(FeatureMethod) > 1; now only uses FeatureTypeList[0]
    os.system('cls')

    ## Construct / load DATA object
    DataPath = resultsPath + '\\LearningData'
    if isLoadData:
        print('loading DATA from ' + DataPath + '...')
        dataObject = pickle.load(open(os.path.join(DataPath, 'DATAraw.pickle'), 'rb'))
        # TODO: change 'raw' to the PieceLength variable and make sure it loads the cut data
    else:
        AllAUs = ['TimeStamps', 'TrackingSuccess', 'au1', 'au2', 'au3', 'au4',
                  'au5', 'au6', 'au7', 'au8', 'au9', 'au10', 'au11', 'au12',
                  'au13', 'au14', 'au15', 'au16', 'au17', 'au18', 'au19',
                  'au20', 'au21', 'au22', 'au23', 'au24', 'au25', 'au26',
                  'au27', 'au28', 'au29', 'au30', 'au31', 'au32', 'au33',
                  'au34', 'au35', 'au36', 'au37', 'au38', 'au39', 'au40',
                  'au41', 'au42', 'au43', 'au44', 'au45', 'au46', 'au47', 'au48']
        GoodTrackableAUs = ['au17', 'au18', 'au19', 'au1', 'au22', 'au25',
                            'au26', 'au27', 'au28', 'au29', 'au2', 'au30',
                            'au31', 'au32', 'au33', 'au34', 'au37', 'au41',
                            'au43', 'au45', 'au47', 'au48', 'au8']
        PartNames = 'Interview'
        isQuantize = True
        isCutData = True
        #print('fs-signal: ' + GoodTrackableAUs)
        print('Part: ' + PartNames)
        #print('isQuantize=' + str(isQuantize))
        isSetDataParams = int(raw_input('reset data params? '))
        if isSetDataParams:
            GoodTrackableAUs = raw_input('set fs-signal (as list): ')
            PartNames = raw_input('set Part name (as str, capital first letter): ')
        dataObject = DataObject(PartNames, VarNames=GoodTrackableAUs)
        dataObject.getQuantize()
        pickle.dump(dataObject, open(os.path.join(resultsPath, 'DATA'), 'wb'))

    if isCutData:
        isQuantize = 1  #raw_input('set isQuantize to: ')
        print('constructing Data Object...')
        print('cutting raw data')
        dataObject.rawDF = dataObject.cutData(dataObject.rawDF, PieceLength)
        print('cutting quantized data')
        dataObject.quantizedDF = dataObject.cutData(dataObject.quantizedDF, PieceLength)
        isSaveData = 1  #int(raw_input('save cut data? '))
        if isSaveData:
            saveName = os.path.join(resultsPath, 'LearningData',
                                    'DATA_' + str(PieceLength))
            #dataObject.rawDF.to_csv(saveName + 'rawDF.csv')
            #dataObject.quantizedDF.to_csv(saveName + 'quantizedDF.csv')
            pickle.dump(dataObject, open(saveName + '.pickle', 'wb'))

    ## Calc / load FEATURES for learning
    FeaturesPath = resultsPath + '\\LearningFeatures\\' + FeatureMethod + \
        '_Features_' + str(PieceLength)
    Features = FeatureObject(dataObject, FeaturesPath, PieceLength)
    if isLoadFeatures:
        print('loading FEATURES from ' + FeaturesPath + '...\n')
        Features.FeaturesDF = read_csv(FeaturesPath + 'DF.csv',
                                       index_col=[0, 1],
                                       skipinitialspace=True,
                                       header=[0, 1])
        Features.method = FeatureMethod
    else:
        if not FeatureMethod:
            FeatureMethod = raw_input("Enter Feature Type ('Quantization', 'Moments') as list: ")
        print("Calculating subjects' " + FeatureMethod + " features ...")
        Features.getFeatures(FeatureMethod)
        if isGetFeaturesNaNs:
            Features.FeaturesDF = featuresUtils.getMissingFeatures(Features)
        Features.FeaturesDF.to_csv(Features.FeaturesPath + 'DF.csv')

    ## Set / load LABELS for learning
    LabelsPath = resultsPath + '\\LearningLabels\\' + LabelBy + '_Labels'  # for loading / saving
    LabelsPath2 = LabelsPath + '2'
    if isLoadLabels:
        print('loading LABELS from ' + LabelsPath + '...\n')
        Labels = pickle.load(open(LabelsPath + ".pickle", 'rb'))
        Labels2 = Labels  #pickle.load(open(LabelsPath2 + ".pickle", 'rb'))
        # TODO: change this when there is a second labeled data set (from Michael)
    else:
        Labels = LabelObject(SubjectsDetailsDF, LabelsPath)
        Labels.getLabels(LabelBy)
        SubjectsDetailsDF2 = DF.from_csv(
            'C:\\Users\\taliat01\\Desktop\\TALIA\\Code-Python\\Results\\SubjectsDetailsDF2-fill with data from michael.csv')
        Labels2 = LabelObject(SubjectsDetailsDF2, LabelsPath2)
        Labels2.getLabels(LabelBy)
    #Labels.permLabels()  # TODO: move this to "not isLoad" or somewhere else.

    ## Get cross-validation learning results:
    # loop over feature number
    FeatureRange = [10]  #range(1, 6)  #range(1, 50, 5)  #[6]
    # init
    isBoolLabel = Labels.isBoolLabel
    FeatureComparession = {}
    SelectedFeaturesComparession = {}
    newDF = lambda: DF(columns=FeatureRange, index=Labels.names)
    if isBoolLabel:
        All_specificity = newDF()
        All_sensitivity = newDF()
        All_precision = newDF()
        All_accuracy = newDF()
        All_f1 = newDF()
        All_ss_mean = newDF()
    else:
        All_trainR = newDF()
        All_trainPval = newDF()
        All_trainErr = newDF()
        All_testR = newDF()
        All_testPval = newDF()
        All_testErr = newDF()
        All_testErrStd = newDF()
        All_LabelRange = newDF()

    ModelList = ['ridge']  #['regression', 'ridge', 'lasso']
    FeatureSelectionList = ['PCA']  #, 'KernelPCA', 'SparsePCA', 'ICA']
    for m in ModelList:
        print('************************************ Model = ' + m +
              '************************************')
        for fs in FeatureSelectionList:
            print('***************************** FeatureSelection = ' + fs +
                  '******************************')
            for f in FeatureRange:
                print('Num Of Features = ' + str(f))
                s = LearnObject(Features, Labels, Labels2)
                s.run(Model=m, n_features=f, isSavePickle=0, isSaveCsv=1,
                      isSaveFig=1, isPerm=0, isBetweenSubjects=True,
                      FeatureSelection=fs)
                LabelNameList = s.ResultsDF.columns  # TODO - CHANGE THIS!
                for label in LabelNameList:
                    print(label)
                    if f == FeatureRange[0]:
                        FeatureComparession[label] = DF(columns=FeatureRange,
                                                        index=s.ResultsDF.index)
                        SelectedFeaturesComparession[label] = DF(columns=FeatureRange,
                                                                 index=s.BestFeatures.index)
                    FeatureComparession[label][f] = s.ResultsDF[label]
                    SelectedFeaturesComparession[label][f] = s.BestFeatures[label]
                    r = s.ResultsDF[label]
                    if isBoolLabel:
                        All_specificity[f].loc[label] = r['specificity']
                        All_sensitivity[f].loc[label] = r['sensitivity']
                        All_precision[f].loc[label] = r['precision']
                        All_accuracy[f].loc[label] = r['accuracy']
                        All_f1[f].loc[label] = r['f1']
                        All_ss_mean[f].loc[label] = r['ss_mean']
                    else:
                        All_trainR[f].loc[label] = r['trainR^2']
                        All_trainPval[f].loc[label] = r['trainPval']
                        All_trainErr[f].loc[label] = r['trainError']
                        All_testR[f].loc[label] = r['testR^2']
                        All_testPval[f].loc[label] = r['testPval']
                        All_testErr[f].loc[label] = r['testError']
                        All_testErrStd[f].loc[label] = r['testErrorStd']
                        All_LabelRange[f].loc[label] = r['LabelRange']

    for label in LabelNameList:
        saveName = s.Learningdetails['saveDir'] + '\\' + label + '_ResultsSummary.csv'
        if os.path.exists(saveName):
            isSave = raw_input('the file ' + saveName +
                               ' already exists, \noverwrite existing file? ')
        else:
            isSave = 1
        if isSave:
            resultsSum = concat([
                DF(index=['----------- Learning results -----------']),
                FeatureComparession[label],
                DF(index=['-------Selected Features Analysis-------']),
                SelectedFeaturesComparession[label],
                DF(index=['----------- Learning details -----------']),
                DF.from_dict(s.Learningdetails, orient='index')
            ])
            if s.isDecompose:
                resultsSum = concat([resultsSum, s.LabelComponents[label]])
            resultsSum.to_csv(saveName)

    if isBoolLabel:
        ResultsSummary = concat([
            DF(index=['------specificity vs. Number Of Features-------']),
            All_specificity,
            DF(index=['------sensitivity vs. Number Of Features-------']),
            All_sensitivity,
            DF(index=['------precision vs. Number Of Features-------']),
            All_precision,
            DF(index=['------accuracy vs. Number Of Features-------']),
            All_accuracy,
            DF(index=['------f1 vs. Number Of Features-------']),
            All_f1,
            DF(index=['------sensitivity-specificity mean vs. Number Of Features-------']),
            All_ss_mean
        ])
        ResultsSummary.to_csv(s.Learningdetails['saveDir'] + '\\ResultsSummary_bool.csv')
    else:
        ResultsSummary = concat([
            DF(index=['------train R^2 vs. Number Of Features-------']),
            All_trainR.dropna(),
            DF(index=['------train Pval vs. Number Of Features-------']),
            All_trainPval.dropna(),
            DF(index=['------train Error vs. Number Of Features-------']),
            All_trainErr.dropna(),
            DF(index=['------test R^2 vs. Number Of Features-------']),
            All_testR.dropna(),
            DF(index=['------testPval vs. Number Of Features-------']),
            All_testPval.dropna(),
            DF(index=['------test Error vs. Number Of Features-------']),
            All_testErr.dropna(),
            DF(index=['------test Error STD vs. Number Of Features-------']),
            All_testErrStd.dropna(),
            DF(index=['------Label Range vs. Number Of Features-------']),
            All_LabelRange.dropna()
        ])
        ResultsSummary.to_csv(s.Learningdetails['saveDir'] + '\\ResultsSummary_regression.csv')

    # permutation test:
    """
    # init
import pandas as pd
from pandas import DataFrame

df_tennis = DataFrame.from_csv('tennis.csv')
print("\n Given Play Tennis Data Set:\n\n", df_tennis)
        y = line.split(',')
        y.pop()  # pop off \r\n
        if not '' in y:
            formatted.writelines(line)
        else:
            continue
        i = i + 1
    except:
        break

fpr.close()
formatted.close()
# NOW FORMATTED AS CSV

# Extract csv into DataFrame
DF = df.from_csv(new, sep=',', index_col=None)

# read CSV and format
#fpr = open(filename, 'r')
#fwp = open(outname, 'w')
#Fin = csv.reader(fp)
#dex = ['Date', 'Time']
#DF.set_index(dex)

length = len(DF.index) / 3
DF3 = df(index=range(length), columns=out_columns)
for i in range(length):
    j = 3 * i
""" Created on Tue Sep 24 11:34:04 2019 @author: Soumaya """ import pandas as pd from pandas import DataFrame df_irisbd = DataFrame.from_csv(r"iris.data", header=None, index_col=None) print(df_irisbd) X = df_irisbd.iloc[:, :-1].values y = df_irisbd.iloc[:, 4].values from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=100) print(y_test) from sklearn.neighbors import KNeighborsClassifier model = KNeighborsClassifier(n_neighbors=3) # Train the model using the training sets model.fit(X_train, y_train) predicted = model.predict(X_test) # 0:Overcast, 2:Mild from sklearn.metrics import classification_report, confusion_matrix print(confusion_matrix(y_test, predicted)) print(classification_report(y_test, predicted))
# coding: utf-8

import numpy as np
import pandas as pd
from pandas import DataFrame as df
from cylp.cy import CyClpSimplex
from cylp.py.modeling.CyLPModel import CyLPArray, CyLPModel
from Single_Year_Stage_II import Single_Year_Stage_II

# Import PUF, and results from Stage I
puf = pd.read_csv("/Users/Amy/Documents/puf.csv")
Stage_I_factors = df.from_csv("Stage_I_factors.csv", index_col=0)
Stage_II_targets = df.from_csv("Stage_II_targets.csv", index_col=0)

# all the final weights will be saved in z
length = len(puf.s006)
z = np.empty([length, 17])
z[:, 0] = puf.s006 / 100

# run the LP solver for each year, with tolerance given in the tol argument
z[:, 1] = Single_Year_Stage_II(puf, Stage_I_factors, Stage_II_targets,
                               year='2009', tol=0.24)
z[:, 2] = Single_Year_Stage_II(puf, Stage_I_factors, Stage_II_targets,
                               year='2010',
import json
import requests
import pandas as pd
from pandas import DataFrame

# Search using column names in LOV
# save results to files
# the code needs to be run for each dataframe
df = DataFrame.from_csv('geneData\\genes.tsv', sep='\t')  # pharmgkb
# df = DataFrame.from_csv('geneData\\hgnc_complete_subset.tsv', sep='\t')  # hgnc
# df = DataFrame.from_csv('geneData\\CTD_chem_gene_ixns.tsv', sep='\t')  # ctd
# df = pd.read_csv('geneData\\genage_human.csv')

cols = df.columns.values  # (saved_column)
# dfres = pd.DataFrame(columns=['column_name', 'results', 'no_results', 'vocab_list'])
vocab_list = []
results_list = []
list_dict = []

for col in cols:
    # if "_" in col:
    #     col = col.replace("_", " ")
    parameters = {"q": col, "tag": "Biology", "type": "property"}
    # print(col)
    # Make a GET request with the parameters.
    response = requests.get(
        "https://lov.linkeddata.es/dataset/lov/api/v2/term/search?",
        params=parameters)
    parsed = json.loads(response.content)
    # print(response.content)
    try:
def read_comment_from_file(path='../data/test'):
    # forward slashes avoid the broken backslash escapes in the original default
    comments_df = DataFrame.from_csv(path, header=0, sep='\t')
    comments_df.reset_index(inplace=True)
    return comments_df
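# Example usage (a sketch; assumes a tab-separated comment file exists at
# the default location):
comments = read_comment_from_file()
print(comments.head())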
from pandas import DataFrame
import math

dfH = DataFrame.from_csv('geneData\\hgnccom.csv', index_col=None)  # hgnc
dfg = DataFrame.from_csv('geneData\\genes.tsv', sep='\t', index_col=None)  # pharmgkb

# For the first approach, the numbers from the unique sum need to be eliminated.

############################## location columns from the HGNC file
d1_l = dfH['location'].tolist()
d2_l = dfH['location_sortable'].tolist()
d1_lc = []
d2_lc = []
for i in d1_l:
    if i not in d1_lc:
        d1_lc.append(i)
for j in d2_l:
    if j not in d2_lc:
        d2_lc.append(j)
uniq = []
for x in d1_lc:
    if x not in uniq:
        uniq.append(x)
for y in d2_lc:
    if y not in uniq:
        uniq.append(y)
# print(len(uniq))
# print(len(d1_lc))
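# A more compact, order-preserving equivalent of the dedup loops above
# (dict.fromkeys keeps first-seen order; assumes the values are hashable,
# which holds for the string/NaN location labels here):
d1_lc = list(dict.fromkeys(d1_l))
d2_lc = list(dict.fromkeys(d2_l))
uniq = list(dict.fromkeys(d1_lc + d2_lc))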
import re

import pandas as pd
from pandas import DataFrame
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models, utils

# Fields in data set
# "id","comment_text","toxic","severe_toxic","obscene","threat","insult","identity_hate"
pd.options.display.float_format = '{:,.0f}'.format
regex = re.compile("[^a-zA-Z']")
stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

Trainfile = "C:\\Kaggle\\train.txt"
df = DataFrame.from_csv(Trainfile, sep='\t', header=0, index_col=None)
comments = df.iloc[:, :8]
# print(df.info())


def perform_lsi(corpus, num_topic, dictionary):
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topic)
    print(lsi.print_topics(num_topics=num_topic, num_words=20))


def perform_lda(corpus, num_topic, dictionary):
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topic)
    print(lda.print_topics(num_topics=num_topic, num_words=20))
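# A minimal sketch of building the inputs these helpers expect, using
# standard gensim API; 'tokenize' is a hypothetical cleaning step wired
# from the regex/stopwords/lemmatizer defined above.
def tokenize(text):
    words = regex.sub(' ', text).lower().split()
    return [lemmatizer.lemmatize(w) for w in words if w not in stopwords]

texts = [tokenize(t) for t in comments['comment_text'].astype(str)]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
perform_lsi(corpus, 10, dictionary)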
def hapmap3(data_set='hapmap3'):
    try:
        from pandas import read_pickle, DataFrame
        from sys import stdout
        import bz2
    except ImportError:
        # the original "raise i, msg" form is invalid for instance exceptions
        raise ImportError("Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset")
    if not data_available(data_set):
        download_data(data_set)
    dirpath = os.path.join(data_path, 'hapmap3')
    hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
    preprocessed_data_paths = [
        os.path.join(dirpath, hapmap_file_name + file_name)
        for file_name in ['.snps.pickle', '.info.pickle', '.nan.pickle']
    ]
    if not all(map(os.path.exists, preprocessed_data_paths)):
        if not overide_manual_authorize and prompt_user(
                "Preprocessing requires 17GB "
                "of memory and can take a long time, continue? [Y/n]\n"):
            print "Preprocessing required for further usage."
            return
        status = "Preprocessing data, please be patient..."
        print status

        def write_status(message, progress, status):
            stdout.write(" " * len(status))
            stdout.write("\r")
            stdout.flush()
            status = r"[{perc: <{ll}}] {message: <13s}".format(
                message=message, ll=20, perc="=" * int(20. * progress / 100.))
            stdout.write(status)
            stdout.flush()
            return status

        unpacked_files = [
            os.path.join(dirpath, hapmap_file_name + ending)
            for ending in ['.ped', '.map']
        ]
        if not all(map(os.path.exists, unpacked_files)):
            status = write_status('unpacking...', 0, '')
            curr = 0
            for newfilepath in unpacked_files:
                if not os.path.exists(newfilepath):
                    filepath = newfilepath + '.bz2'
                    file_size = os.path.getsize(filepath)
                    with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as f:
                        decomp = bz2.BZ2Decompressor()
                        file_processed = 0
                        buffsize = 100 * 1024
                        for data in iter(lambda: f.read(buffsize), b''):
                            new_file.write(decomp.decompress(data))
                            file_processed += len(data)
                            write_status('unpacking...',
                                         curr + 12. * file_processed / file_size,
                                         status)
                curr += 12
                status = write_status('unpacking...', curr, status)
        status = write_status('reading .ped...', 25, status)
        # Preprocess data: read the unpacked files from dirpath (the bare
        # filenames in the original only worked from inside that directory).
        snpstrnp = np.loadtxt(unpacked_files[0], dtype=str)
        status = write_status('reading .map...', 33, status)
        mapnp = np.loadtxt(unpacked_files[1], dtype=str)
        status = write_status('reading relationships.txt...', 42, status)
        # and metainfo:
        infodf = DataFrame.from_csv(
            os.path.join(dirpath, 'relationships_w_pops_121708.txt'),
            header=0, sep='\t')
        infodf.set_index('IID', inplace=True)
        status = write_status('filtering nan...', 45, status)
        snpstr = snpstrnp[:, 6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2)
        inan = snpstr[:, :, 0] == '0'
        status = write_status('filtering reference alleles...', 55, status)
        ref = np.array(
            map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0, 1)[:, :, :]))
        status = write_status('encoding snps...', 70, status)
        # Encode the information for each gene in {-1,0,1}:
        status = write_status('encoding snps...', 73, status)
        snps = (snpstr == ref[None, :, :])
        status = write_status('encoding snps...', 76, status)
        snps = (snps * np.array([1, -1])[None, None, :])
        status = write_status('encoding snps...', 78, status)
        snps = snps.sum(-1)
        status = write_status('encoding snps', 81, status)
        # int8, so the nan marker -128 fits ('S1' in the original would have
        # stored truncated strings instead of numbers)
        snps = snps.astype('i1')
        status = write_status('marking nan values...', 88, status)
        # put in nan values (masked as -128):
        snps[inan] = -128
        status = write_status('setting up meta...', 94, status)
        # get meta information:
        metaheader = np.r_[[
            'family_id', 'iid', 'paternal_id', 'maternal_id', 'sex', 'phenotype'
        ]]
        metadf = DataFrame(columns=metaheader, data=snpstrnp[:, :6])
        metadf.set_index('iid', inplace=True)
        metadf = metadf.join(infodf.population)
        metadf.to_pickle(preprocessed_data_paths[1])
        # put everything together:
        status = write_status('setting up snps...', 96, status)
        snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:, 1])
        snpsdf.to_pickle(preprocessed_data_paths[0])
        status = write_status('setting up snps...', 98, status)
        inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:, 1])
        inandf.to_pickle(preprocessed_data_paths[2])
        status = write_status('done :)', 100, status)
        print ''
    else:
        print "loading snps..."
        snpsdf = read_pickle(preprocessed_data_paths[0])
        print "loading metainfo..."
        metadf = read_pickle(preprocessed_data_paths[1])
        print "loading nan entries..."
        inandf = read_pickle(preprocessed_data_paths[2])
    snps = snpsdf.values
    populations = metadf.population.values.astype('S3')
    hapmap = dict(name=data_set,
                  description='The HapMap phase three SNP dataset - '
                  '1184 samples out of 11 populations. inan is a '
                  'boolean array, containing whether or not the '
                  'given entry is nan (nans are masked as '
                  '-128 in snps).',
                  snpsdf=snpsdf,
                  metadf=metadf,
                  snps=snps,
                  inan=inandf.values,
                  inandf=inandf,
                  populations=populations)
    return hapmap
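# Example usage (a sketch): the returned dict exposes the encoded SNP matrix
# plus metadata, per the keys assembled above.
data = hapmap3()
print data['snps'].shape       # samples x SNPs; values in {-1, 0, 1}, nan masked as -128
print data['populations'][:5]  # population code per sample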
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 26 14:30:20 2019

@author: Student
"""
import pandas as pd
from pandas import DataFrame

df_irisbd = DataFrame.from_csv(r"C:\deepa\iris.data", header=None, index_col=None)
print(df_irisbd)

X = df_irisbd.iloc[:, :-1].values
y = df_irisbd.iloc[:, 4].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=100)
print(y_test)

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)
# Train the model using the training sets
model.fit(X_train, y_train)
predicted = model.predict(X_test)  # predicted iris class labels

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))
popID = []  # cases missing from the clinical data, to be dropped below
for key in mergeDataID:
    if key in clinicalCases:
        mergeDataID[key].update(clinicalCases[key])
    else:
        popID.append(key)
for i in popID:
    mergeDataID.pop(i, None)

test = list(mergeDataID["14313474-376f-4606-9ed9-25ed2acff411"].values())
test2 = list(mergeDataID["14313474-376f-4606-9ed9-25ed2acff411"].keys())

# Add mutations and genes
from pandas import DataFrame
df = DataFrame.from_csv("lungData.tsv", sep="\t")
dataDict = df.to_dict()
temp = dataDict["Mutations"]
# int(''.join(v.split(','))) strips thousands separators, e.g. "1,234" -> 1234
mutDict = {
    k: {
        "simple_somatic_mutations": int(''.join(v.split(',')))
    }
    for k, v in temp.items()
}
temp = dataDict["Genes"]
genesDict = {
    k: {
        "genes_with_simple_somatic_mutations": int(''.join(v.split(',')))
    }
    for k, v in temp.items()
}
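# A hedged sketch of the likely next step (an assumption -- the script is
# truncated above): folding the per-case counts back into mergeDataID the
# same way clinicalCases was merged in.
for counts in (mutDict, genesDict):
    for k, v in counts.items():
        if k in mergeDataID:
            mergeDataID[k].update(v)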
try:
    from bs4 import BeautifulSoup
except ImportError:
    from BeautifulSoup import BeautifulSoup
import time, requests, urllib2, webbrowser, os, csv
import geocoder, math
from pandas import DataFrame

# &markers=color:blue%7Clabel:S%7C40.702147,-74.015794
df = DataFrame.from_csv('WEST BENGAL_data.csv', sep=',', parse_dates=False)
df.insert(11, 'ZIP', 'NA')
map_string_list = []
safe_markers = []
unsafe_markers = []
lat_long_list = []
safe_zips = []
unsafe_zips = []
final_string = ''
unsafe_info_window_string = []
safe_info_window_string = []
for index, row in df.iterrows():
    lat = row[8]
    lng = row[9]
    source = row[0]
    label = row[7]
    if lat == 'NA' or lng == 'NA':  # skip rows missing either coordinate
        continue
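# A sketch of turning collected coordinates into a Static Maps markers
# parameter (format taken from the commented example above; the helper
# name and wiring are illustrative only):
def marker_param(color, label, points):
    # points: iterable of (lat, lng) tuples
    coords = '%7C'.join('{},{}'.format(lat, lng) for lat, lng in points)
    return '&markers=color:{}%7Clabel:{}%7C{}'.format(color, label, coords)

# e.g. final_string += marker_param('blue', 'S', lat_long_list)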